PR sanitizer/80067
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob3d1d7e7f918b98a045b17276369ab6d5519891a3
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
54 /* Return true if load- or store-lanes optab OPTAB is implemented for
55 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
57 static bool
58 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
59 tree vectype, unsigned HOST_WIDE_INT count)
61 machine_mode mode, array_mode;
62 bool limit_p;
64 mode = TYPE_MODE (vectype);
65 limit_p = !targetm.array_mode_supported_p (mode, count);
66 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
67 MODE_INT, limit_p);
69 if (array_mode == BLKmode)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
74 GET_MODE_NAME (mode), count);
75 return false;
78 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
80 if (dump_enabled_p ())
81 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
82 "cannot use %s<%s><%s>\n", name,
83 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
84 return false;
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_NOTE, vect_location,
89 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
90 GET_MODE_NAME (mode));
92 return true;
96 /* Return the smallest scalar part of STMT.
97 This is used to determine the vectype of the stmt. We generally set the
98 vectype according to the type of the result (lhs). For stmts whose
99 result-type is different than the type of the arguments (e.g., demotion,
100 promotion), vectype will be reset appropriately (later). Note that we have
101 to visit the smallest datatype in this function, because that determines the
102 VF. If the smallest datatype in the loop is present only as the rhs of a
103 promotion operation - we'd miss it.
104 Such a case, where a variable of this datatype does not appear in the lhs
105 anywhere in the loop, can only occur if it's an invariant: e.g.:
106 'int_x = (int) short_inv', which we'd expect to have been optimized away by
107 invariant motion. However, we cannot rely on invariant motion to always
108 take invariants out of the loop, and so in the case of promotion we also
109 have to check the rhs.
110 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
111 types. */
113 tree
114 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
115 HOST_WIDE_INT *rhs_size_unit)
117 tree scalar_type = gimple_expr_type (stmt);
118 HOST_WIDE_INT lhs, rhs;
120 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
122 if (is_gimple_assign (stmt)
123 && (gimple_assign_cast_p (stmt)
124 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
125 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
126 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
128 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
130 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
131 if (rhs < lhs)
132 scalar_type = rhs_type;
135 *lhs_size_unit = lhs;
136 *rhs_size_unit = rhs;
137 return scalar_type;
141 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
142 tested at run-time. Return TRUE if DDR was successfully inserted.
143 Return false if versioning is not supported. */
145 static bool
146 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
148 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
150 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
151 return false;
153 if (dump_enabled_p ())
155 dump_printf_loc (MSG_NOTE, vect_location,
156 "mark for run-time aliasing test between ");
157 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
158 dump_printf (MSG_NOTE, " and ");
159 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
160 dump_printf (MSG_NOTE, "\n");
163 if (optimize_loop_nest_for_size_p (loop))
165 if (dump_enabled_p ())
166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
167 "versioning not supported when optimizing"
168 " for size.\n");
169 return false;
172 /* FORNOW: We don't support versioning with outer-loop vectorization. */
173 if (loop->inner)
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
177 "versioning not yet supported for outer-loops.\n");
178 return false;
181 /* FORNOW: We don't support creating runtime alias tests for non-constant
182 step. */
183 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
184 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
186 if (dump_enabled_p ())
187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
188 "versioning not yet supported for non-constant "
189 "step\n");
190 return false;
193 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
194 return true;
198 /* Function vect_analyze_data_ref_dependence.
200 Return TRUE if there (might) exist a dependence between a memory-reference
201 DRA and a memory-reference DRB. When versioning for alias may check a
202 dependence at run-time, return FALSE. Adjust *MAX_VF according to
203 the data dependence. */
205 static bool
206 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
207 loop_vec_info loop_vinfo, int *max_vf)
209 unsigned int i;
210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
211 struct data_reference *dra = DDR_A (ddr);
212 struct data_reference *drb = DDR_B (ddr);
213 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
214 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
215 lambda_vector dist_v;
216 unsigned int loop_depth;
218 /* In loop analysis all data references should be vectorizable. */
219 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
220 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
221 gcc_unreachable ();
223 /* Independent data accesses. */
224 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
225 return false;
227 if (dra == drb
228 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
229 return false;
231 /* We do not have to consider dependences between accesses that belong
232 to the same group. */
233 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
234 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
235 return false;
237 /* Even if we have an anti-dependence then, as the vectorized loop covers at
238 least two scalar iterations, there is always also a true dependence.
239 As the vectorizer does not re-order loads and stores we can ignore
240 the anti-dependence if TBAA can disambiguate both DRs similar to the
241 case with known negative distance anti-dependences (positive
242 distance anti-dependences would violate TBAA constraints). */
243 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
244 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
245 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
246 get_alias_set (DR_REF (drb))))
247 return false;
249 /* Unknown data dependence. */
250 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
252 /* If user asserted safelen consecutive iterations can be
253 executed concurrently, assume independence. */
254 if (loop->safelen >= 2)
256 if (loop->safelen < *max_vf)
257 *max_vf = loop->safelen;
258 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
259 return false;
262 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
263 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
265 if (dump_enabled_p ())
267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
268 "versioning for alias not supported for: "
269 "can't determine dependence between ");
270 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
271 DR_REF (dra));
272 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
273 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
274 DR_REF (drb));
275 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
277 return true;
280 if (dump_enabled_p ())
282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
283 "versioning for alias required: "
284 "can't determine dependence between ");
285 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
286 DR_REF (dra));
287 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
289 DR_REF (drb));
290 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
293 /* Add to list of ddrs that need to be tested at run-time. */
294 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
297 /* Known data dependence. */
298 if (DDR_NUM_DIST_VECTS (ddr) == 0)
300 /* If user asserted safelen consecutive iterations can be
301 executed concurrently, assume independence. */
302 if (loop->safelen >= 2)
304 if (loop->safelen < *max_vf)
305 *max_vf = loop->safelen;
306 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
307 return false;
310 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
311 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
313 if (dump_enabled_p ())
315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
316 "versioning for alias not supported for: "
317 "bad dist vector for ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
319 DR_REF (dra));
320 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
321 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
322 DR_REF (drb));
323 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
325 return true;
328 if (dump_enabled_p ())
330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
331 "versioning for alias required: "
332 "bad dist vector for ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
334 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
335 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
336 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
338 /* Add to list of ddrs that need to be tested at run-time. */
339 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
342 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
343 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
345 int dist = dist_v[loop_depth];
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance = %d.\n", dist);
351 if (dist == 0)
353 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "dependence distance == 0 between ");
357 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
358 dump_printf (MSG_NOTE, " and ");
359 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
360 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
363 /* When we perform grouped accesses and perform implicit CSE
364 by detecting equal accesses and doing disambiguation with
365 runtime alias tests like for
366 .. = a[i];
367 .. = a[i+1];
368 a[i] = ..;
369 a[i+1] = ..;
370 *p = ..;
371 .. = a[i];
372 .. = a[i+1];
373 where we will end up loading { a[i], a[i+1] } once, make
374 sure that inserting group loads before the first load and
375 stores after the last store will do the right thing.
376 Similar for groups like
377 a[i] = ...;
378 ... = a[i];
379 a[i+1] = ...;
380 where loads from the group interleave with the store. */
381 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
382 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
384 gimple *earlier_stmt;
385 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
386 if (DR_IS_WRITE
387 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
389 if (dump_enabled_p ())
390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
391 "READ_WRITE dependence in interleaving."
392 "\n");
393 return true;
397 continue;
400 if (dist > 0 && DDR_REVERSED_P (ddr))
402 /* If DDR_REVERSED_P the order of the data-refs in DDR was
403 reversed (to make distance vector positive), and the actual
404 distance is negative. */
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "dependence distance negative.\n");
408 /* Record a negative dependence distance to later limit the
409 amount of stmt copying / unrolling we can perform.
410 Only need to handle read-after-write dependence. */
411 if (DR_IS_READ (drb)
412 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
413 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
414 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
415 continue;
418 if (abs (dist) >= 2
419 && abs (dist) < *max_vf)
421 /* The dependence distance requires reduction of the maximal
422 vectorization factor. */
423 *max_vf = abs (dist);
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE, vect_location,
426 "adjusting maximal vectorization factor to %i\n",
427 *max_vf);
430 if (abs (dist) >= *max_vf)
432 /* Dependence distance does not create dependence, as far as
433 vectorization is concerned, in this case. */
434 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE, vect_location,
436 "dependence distance >= VF.\n");
437 continue;
440 if (dump_enabled_p ())
442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
443 "not vectorized, possible dependence "
444 "between data-refs ");
445 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
446 dump_printf (MSG_NOTE, " and ");
447 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
448 dump_printf (MSG_NOTE, "\n");
451 return true;
454 return false;
457 /* Function vect_analyze_data_ref_dependences.
459 Examine all the data references in the loop, and make sure there do not
460 exist any data dependences between them. Set *MAX_VF according to
461 the maximum vectorization factor the data dependences allow. */
463 bool
464 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
466 unsigned int i;
467 struct data_dependence_relation *ddr;
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location,
471 "=== vect_analyze_data_ref_dependences ===\n");
473 LOOP_VINFO_DDRS (loop_vinfo)
474 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
475 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
476 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
477 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
478 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
479 &LOOP_VINFO_DDRS (loop_vinfo),
480 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
481 return false;
483 /* For epilogues we either have no aliases or alias versioning
484 was applied to original loop. Therefore we may just get max_vf
485 using VF of original loop. */
486 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
487 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
488 else
489 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
490 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
491 return false;
493 return true;
497 /* Function vect_slp_analyze_data_ref_dependence.
499 Return TRUE if there (might) exist a dependence between a memory-reference
500 DRA and a memory-reference DRB. When versioning for alias may check a
501 dependence at run-time, return FALSE. Adjust *MAX_VF according to
502 the data dependence. */
504 static bool
505 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
507 struct data_reference *dra = DDR_A (ddr);
508 struct data_reference *drb = DDR_B (ddr);
510 /* We need to check dependences of statements marked as unvectorizable
511 as well, they still can prohibit vectorization. */
513 /* Independent data accesses. */
514 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
515 return false;
517 if (dra == drb)
518 return false;
520 /* Read-read is OK. */
521 if (DR_IS_READ (dra) && DR_IS_READ (drb))
522 return false;
524 /* If dra and drb are part of the same interleaving chain consider
525 them independent. */
526 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
527 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
528 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
529 return false;
531 /* Unknown data dependence. */
532 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537 "can't determine dependence between ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
541 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 else if (dump_enabled_p ())
546 dump_printf_loc (MSG_NOTE, vect_location,
547 "determined dependence between ");
548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
549 dump_printf (MSG_NOTE, " and ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
551 dump_printf (MSG_NOTE, "\n");
554 return true;
558 /* Analyze dependences involved in the transform of SLP NODE. STORES
559 contain the vector of scalar stores of this instance if we are
560 disambiguating the loads. */
562 static bool
563 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
564 vec<gimple *> stores, gimple *last_store)
566 /* This walks over all stmts involved in the SLP load/store done
567 in NODE verifying we can sink them up to the last stmt in the
568 group. */
569 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
570 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
572 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
573 if (access == last_access)
574 continue;
575 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
576 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
577 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
579 gimple *stmt = gsi_stmt (gsi);
580 if (! gimple_vuse (stmt)
581 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
582 continue;
584 /* If we couldn't record a (single) data reference for this
585 stmt we have to give up. */
586 /* ??? Here and below if dependence analysis fails we can resort
587 to the alias oracle which can handle more kinds of stmts. */
588 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
589 if (!dr_b)
590 return false;
592 bool dependent = false;
593 /* If we run into a store of this same instance (we've just
594 marked those) then delay dependence checking until we run
595 into the last store because this is where it will have
596 been sunk to (and we verify if we can do that as well). */
597 if (gimple_visited_p (stmt))
599 if (stmt != last_store)
600 continue;
601 unsigned i;
602 gimple *store;
603 FOR_EACH_VEC_ELT (stores, i, store)
605 data_reference *store_dr
606 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
607 ddr_p ddr = initialize_data_dependence_relation
608 (dr_a, store_dr, vNULL);
609 dependent = vect_slp_analyze_data_ref_dependence (ddr);
610 free_dependence_relation (ddr);
611 if (dependent)
612 break;
615 else
617 ddr_p ddr = initialize_data_dependence_relation (dr_a,
618 dr_b, vNULL);
619 dependent = vect_slp_analyze_data_ref_dependence (ddr);
620 free_dependence_relation (ddr);
622 if (dependent)
623 return false;
626 return true;
630 /* Function vect_analyze_data_ref_dependences.
632 Examine all the data references in the basic-block, and make sure there
633 do not exist any data dependences between them. Set *MAX_VF according to
634 the maximum vectorization factor the data dependences allow. */
636 bool
637 vect_slp_analyze_instance_dependence (slp_instance instance)
639 if (dump_enabled_p ())
640 dump_printf_loc (MSG_NOTE, vect_location,
641 "=== vect_slp_analyze_instance_dependence ===\n");
643 /* The stores of this instance are at the root of the SLP tree. */
644 slp_tree store = SLP_INSTANCE_TREE (instance);
645 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
646 store = NULL;
648 /* Verify we can sink stores to the vectorized stmt insert location. */
649 gimple *last_store = NULL;
650 if (store)
652 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
653 return false;
655 /* Mark stores in this instance and remember the last one. */
656 last_store = vect_find_last_scalar_stmt_in_slp (store);
657 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
658 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
661 bool res = true;
663 /* Verify we can sink loads to the vectorized stmt insert location,
664 special-casing stores of this instance. */
665 slp_tree load;
666 unsigned int i;
667 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
668 if (! vect_slp_analyze_node_dependences (instance, load,
669 store
670 ? SLP_TREE_SCALAR_STMTS (store)
671 : vNULL, last_store))
673 res = false;
674 break;
677 /* Unset the visited flag. */
678 if (store)
679 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
680 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
682 return res;
685 /* Function vect_compute_data_ref_alignment
687 Compute the misalignment of the data reference DR.
689 Output:
690 1. If during the misalignment computation it is found that the data reference
691 cannot be vectorized then false is returned.
692 2. DR_MISALIGNMENT (DR) is defined.
694 FOR NOW: No analysis is actually performed. Misalignment is calculated
695 only for trivial cases. TODO. */
697 bool
698 vect_compute_data_ref_alignment (struct data_reference *dr)
700 gimple *stmt = DR_STMT (dr);
701 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
702 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
703 struct loop *loop = NULL;
704 tree ref = DR_REF (dr);
705 tree vectype;
706 tree base, base_addr;
707 tree misalign = NULL_TREE;
708 tree aligned_to;
709 tree step;
710 unsigned HOST_WIDE_INT alignment;
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE, vect_location,
714 "vect_compute_data_ref_alignment:\n");
716 if (loop_vinfo)
717 loop = LOOP_VINFO_LOOP (loop_vinfo);
719 /* Initialize misalignment to unknown. */
720 SET_DR_MISALIGNMENT (dr, -1);
722 if (tree_fits_shwi_p (DR_STEP (dr)))
723 misalign = DR_INIT (dr);
724 aligned_to = DR_ALIGNED_TO (dr);
725 base_addr = DR_BASE_ADDRESS (dr);
726 vectype = STMT_VINFO_VECTYPE (stmt_info);
728 /* In case the dataref is in an inner-loop of the loop that is being
729 vectorized (LOOP), we use the base and misalignment information
730 relative to the outer-loop (LOOP). This is ok only if the misalignment
731 stays the same throughout the execution of the inner-loop, which is why
732 we have to check that the stride of the dataref in the inner-loop evenly
733 divides by the vector size. */
734 if (loop && nested_in_vect_loop_p (loop, stmt))
736 tree step = DR_STEP (dr);
738 if (tree_fits_shwi_p (step)
739 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_NOTE, vect_location,
743 "inner step divides the vector-size.\n");
744 misalign = STMT_VINFO_DR_INIT (stmt_info);
745 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
746 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
748 else
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
752 "inner step doesn't divide the vector-size.\n");
753 misalign = NULL_TREE;
757 /* Similarly we can only use base and misalignment information relative to
758 an innermost loop if the misalignment stays the same throughout the
759 execution of the loop. As above, this is the case if the stride of
760 the dataref evenly divides by the vector size. */
761 else
763 tree step = DR_STEP (dr);
764 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
766 if (tree_fits_shwi_p (step)
767 && ((tree_to_shwi (step) * vf)
768 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
770 if (dump_enabled_p ())
771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
772 "step doesn't divide the vector-size.\n");
773 misalign = NULL_TREE;
777 /* To look at alignment of the base we have to preserve an inner MEM_REF
778 as that carries alignment information of the actual access. */
779 base = ref;
780 while (handled_component_p (base))
781 base = TREE_OPERAND (base, 0);
782 unsigned int base_alignment = 0;
783 unsigned HOST_WIDE_INT base_bitpos;
784 get_object_alignment_1 (base, &base_alignment, &base_bitpos);
785 /* As data-ref analysis strips the MEM_REF down to its base operand
786 to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
787 adjust things to make base_alignment valid as the alignment of
788 DR_BASE_ADDRESS. */
789 if (TREE_CODE (base) == MEM_REF)
791 /* Note all this only works if DR_BASE_ADDRESS is the same as
792 MEM_REF operand zero, otherwise DR/SCEV analysis might have factored
793 in other offsets. We need to rework DR to compute the alingment
794 of DR_BASE_ADDRESS as long as all information is still available. */
795 if (operand_equal_p (TREE_OPERAND (base, 0), base_addr, 0))
797 base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
798 base_bitpos &= (base_alignment - 1);
800 else
801 base_bitpos = BITS_PER_UNIT;
803 if (base_bitpos != 0)
804 base_alignment = base_bitpos & -base_bitpos;
805 /* Also look at the alignment of the base address DR analysis
806 computed. */
807 unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
808 if (base_addr_alignment > base_alignment)
809 base_alignment = base_addr_alignment;
811 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
812 DR_VECT_AUX (dr)->base_element_aligned = true;
814 alignment = TYPE_ALIGN_UNIT (vectype);
816 if ((compare_tree_int (aligned_to, alignment) < 0)
817 || !misalign)
819 if (dump_enabled_p ())
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "Unknown alignment for access: ");
823 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
824 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
826 return true;
829 if (base_alignment < TYPE_ALIGN (vectype))
831 base = base_addr;
832 if (TREE_CODE (base) == ADDR_EXPR)
833 base = TREE_OPERAND (base, 0);
834 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
836 if (dump_enabled_p ())
838 dump_printf_loc (MSG_NOTE, vect_location,
839 "can't force alignment of ref: ");
840 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
841 dump_printf (MSG_NOTE, "\n");
843 return true;
846 if (DECL_USER_ALIGN (base))
848 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location,
851 "not forcing alignment of user-aligned "
852 "variable: ");
853 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
854 dump_printf (MSG_NOTE, "\n");
856 return true;
859 /* Force the alignment of the decl.
860 NOTE: This is the only change to the code we make during
861 the analysis phase, before deciding to vectorize the loop. */
862 if (dump_enabled_p ())
864 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
865 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
866 dump_printf (MSG_NOTE, "\n");
869 DR_VECT_AUX (dr)->base_decl = base;
870 DR_VECT_AUX (dr)->base_misaligned = true;
871 DR_VECT_AUX (dr)->base_element_aligned = true;
874 if (loop && nested_in_vect_loop_p (loop, stmt))
875 step = STMT_VINFO_DR_STEP (stmt_info);
876 else
877 step = DR_STEP (dr);
878 /* If this is a backward running DR then first access in the larger
879 vectype actually is N-1 elements before the address in the DR.
880 Adjust misalign accordingly. */
881 if (tree_int_cst_sgn (step) < 0)
883 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
884 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
885 otherwise we wouldn't be here. */
886 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
887 /* PLUS because STEP was negative. */
888 misalign = size_binop (PLUS_EXPR, misalign, offset);
891 SET_DR_MISALIGNMENT (dr,
892 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
894 if (dump_enabled_p ())
896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
897 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
898 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
899 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
902 return true;
906 /* Function vect_update_misalignment_for_peel
908 DR - the data reference whose misalignment is to be adjusted.
909 DR_PEEL - the data reference whose misalignment is being made
910 zero in the vector loop by the peel.
911 NPEEL - the number of iterations in the peel loop if the misalignment
912 of DR_PEEL is known at compile time. */
914 static void
915 vect_update_misalignment_for_peel (struct data_reference *dr,
916 struct data_reference *dr_peel, int npeel)
918 unsigned int i;
919 vec<dr_p> same_align_drs;
920 struct data_reference *current_dr;
921 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
922 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
923 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
924 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
926 /* For interleaved data accesses the step in the loop must be multiplied by
927 the size of the interleaving group. */
928 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
929 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
930 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
931 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
933 /* It can be assumed that the data refs with the same alignment as dr_peel
934 are aligned in the vector loop. */
935 same_align_drs
936 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
937 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
939 if (current_dr != dr)
940 continue;
941 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
942 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
943 SET_DR_MISALIGNMENT (dr, 0);
944 return;
947 if (known_alignment_for_access_p (dr)
948 && known_alignment_for_access_p (dr_peel))
950 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
951 int misal = DR_MISALIGNMENT (dr);
952 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
953 misal += negative ? -npeel * dr_size : npeel * dr_size;
954 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
955 SET_DR_MISALIGNMENT (dr, misal);
956 return;
959 if (dump_enabled_p ())
960 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
961 SET_DR_MISALIGNMENT (dr, -1);
965 /* Function verify_data_ref_alignment
967 Return TRUE if DR can be handled with respect to alignment. */
969 static bool
970 verify_data_ref_alignment (data_reference_p dr)
972 enum dr_alignment_support supportable_dr_alignment
973 = vect_supportable_dr_alignment (dr, false);
974 if (!supportable_dr_alignment)
976 if (dump_enabled_p ())
978 if (DR_IS_READ (dr))
979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
980 "not vectorized: unsupported unaligned load.");
981 else
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
983 "not vectorized: unsupported unaligned "
984 "store.");
986 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
987 DR_REF (dr));
988 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
990 return false;
993 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
994 dump_printf_loc (MSG_NOTE, vect_location,
995 "Vectorizing an unaligned access.\n");
997 return true;
1000 /* Function vect_verify_datarefs_alignment
1002 Return TRUE if all data references in the loop can be
1003 handled with respect to alignment. */
1005 bool
1006 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1008 vec<data_reference_p> datarefs = vinfo->datarefs;
1009 struct data_reference *dr;
1010 unsigned int i;
1012 FOR_EACH_VEC_ELT (datarefs, i, dr)
1014 gimple *stmt = DR_STMT (dr);
1015 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1017 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1018 continue;
1020 /* For interleaving, only the alignment of the first access matters. */
1021 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1022 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1023 continue;
1025 /* Strided accesses perform only component accesses, alignment is
1026 irrelevant for them. */
1027 if (STMT_VINFO_STRIDED_P (stmt_info)
1028 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1029 continue;
1031 if (! verify_data_ref_alignment (dr))
1032 return false;
1035 return true;
1038 /* Given an memory reference EXP return whether its alignment is less
1039 than its size. */
1041 static bool
1042 not_size_aligned (tree exp)
1044 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1045 return true;
1047 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1048 > get_object_alignment (exp));
1051 /* Function vector_alignment_reachable_p
1053 Return true if vector alignment for DR is reachable by peeling
1054 a few loop iterations. Return false otherwise. */
1056 static bool
1057 vector_alignment_reachable_p (struct data_reference *dr)
1059 gimple *stmt = DR_STMT (dr);
1060 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1061 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1063 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1065 /* For interleaved access we peel only if number of iterations in
1066 the prolog loop ({VF - misalignment}), is a multiple of the
1067 number of the interleaved accesses. */
1068 int elem_size, mis_in_elements;
1069 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1071 /* FORNOW: handle only known alignment. */
1072 if (!known_alignment_for_access_p (dr))
1073 return false;
1075 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1076 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1078 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1079 return false;
1082 /* If misalignment is known at the compile time then allow peeling
1083 only if natural alignment is reachable through peeling. */
1084 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1086 HOST_WIDE_INT elmsize =
1087 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1088 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_NOTE, vect_location,
1091 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1092 dump_printf (MSG_NOTE,
1093 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1095 if (DR_MISALIGNMENT (dr) % elmsize)
1097 if (dump_enabled_p ())
1098 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1099 "data size does not divide the misalignment.\n");
1100 return false;
1104 if (!known_alignment_for_access_p (dr))
1106 tree type = TREE_TYPE (DR_REF (dr));
1107 bool is_packed = not_size_aligned (DR_REF (dr));
1108 if (dump_enabled_p ())
1109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1110 "Unknown misalignment, %snaturally aligned\n",
1111 is_packed ? "not " : "");
1112 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1115 return true;
1119 /* Calculate the cost of the memory access represented by DR. */
1121 static void
1122 vect_get_data_access_cost (struct data_reference *dr,
1123 unsigned int *inside_cost,
1124 unsigned int *outside_cost,
1125 stmt_vector_for_cost *body_cost_vec)
1127 gimple *stmt = DR_STMT (dr);
1128 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1129 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1130 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1131 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1132 int ncopies = vf / nunits;
1134 if (DR_IS_READ (dr))
1135 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1136 NULL, body_cost_vec, false);
1137 else
1138 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1140 if (dump_enabled_p ())
1141 dump_printf_loc (MSG_NOTE, vect_location,
1142 "vect_get_data_access_cost: inside_cost = %d, "
1143 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1147 typedef struct _vect_peel_info
1149 struct data_reference *dr;
1150 int npeel;
1151 unsigned int count;
1152 } *vect_peel_info;
1154 typedef struct _vect_peel_extended_info
1156 struct _vect_peel_info peel_info;
1157 unsigned int inside_cost;
1158 unsigned int outside_cost;
1159 stmt_vector_for_cost body_cost_vec;
1160 } *vect_peel_extended_info;
1163 /* Peeling hashtable helpers. */
1165 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1167 static inline hashval_t hash (const _vect_peel_info *);
1168 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1171 inline hashval_t
1172 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1174 return (hashval_t) peel_info->npeel;
1177 inline bool
1178 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1180 return (a->npeel == b->npeel);
1184 /* Insert DR into peeling hash table with NPEEL as key. */
1186 static void
1187 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1188 loop_vec_info loop_vinfo, struct data_reference *dr,
1189 int npeel)
1191 struct _vect_peel_info elem, *slot;
1192 _vect_peel_info **new_slot;
1193 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1195 elem.npeel = npeel;
1196 slot = peeling_htab->find (&elem);
1197 if (slot)
1198 slot->count++;
1199 else
1201 slot = XNEW (struct _vect_peel_info);
1202 slot->npeel = npeel;
1203 slot->dr = dr;
1204 slot->count = 1;
1205 new_slot = peeling_htab->find_slot (slot, INSERT);
1206 *new_slot = slot;
1209 if (!supportable_dr_alignment
1210 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1211 slot->count += VECT_MAX_COST;
1215 /* Traverse peeling hash table to find peeling option that aligns maximum
1216 number of data accesses. */
1219 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1220 _vect_peel_extended_info *max)
1222 vect_peel_info elem = *slot;
1224 if (elem->count > max->peel_info.count
1225 || (elem->count == max->peel_info.count
1226 && max->peel_info.npeel > elem->npeel))
1228 max->peel_info.npeel = elem->npeel;
1229 max->peel_info.count = elem->count;
1230 max->peel_info.dr = elem->dr;
1233 return 1;
1237 /* Traverse peeling hash table and calculate cost for each peeling option.
1238 Find the one with the lowest cost. */
1241 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1242 _vect_peel_extended_info *min)
1244 vect_peel_info elem = *slot;
1245 int save_misalignment, dummy;
1246 unsigned int inside_cost = 0, outside_cost = 0, i;
1247 gimple *stmt = DR_STMT (elem->dr);
1248 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1249 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1250 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1251 struct data_reference *dr;
1252 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1254 prologue_cost_vec.create (2);
1255 body_cost_vec.create (2);
1256 epilogue_cost_vec.create (2);
1258 FOR_EACH_VEC_ELT (datarefs, i, dr)
1260 stmt = DR_STMT (dr);
1261 stmt_info = vinfo_for_stmt (stmt);
1262 /* For interleaving, only the alignment of the first access
1263 matters. */
1264 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1265 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1266 continue;
1268 /* Strided accesses perform only component accesses, alignment is
1269 irrelevant for them. */
1270 if (STMT_VINFO_STRIDED_P (stmt_info)
1271 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1272 continue;
1274 save_misalignment = DR_MISALIGNMENT (dr);
1275 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1276 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1277 &body_cost_vec);
1278 SET_DR_MISALIGNMENT (dr, save_misalignment);
1281 outside_cost += vect_get_known_peeling_cost
1282 (loop_vinfo, elem->npeel, &dummy,
1283 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1284 &prologue_cost_vec, &epilogue_cost_vec);
1286 /* Prologue and epilogue costs are added to the target model later.
1287 These costs depend only on the scalar iteration cost, the
1288 number of peeling iterations finally chosen, and the number of
1289 misaligned statements. So discard the information found here. */
1290 prologue_cost_vec.release ();
1291 epilogue_cost_vec.release ();
1293 if (inside_cost < min->inside_cost
1294 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1296 min->inside_cost = inside_cost;
1297 min->outside_cost = outside_cost;
1298 min->body_cost_vec.release ();
1299 min->body_cost_vec = body_cost_vec;
1300 min->peel_info.dr = elem->dr;
1301 min->peel_info.npeel = elem->npeel;
1303 else
1304 body_cost_vec.release ();
1306 return 1;
1310 /* Choose best peeling option by traversing peeling hash table and either
1311 choosing an option with the lowest cost (if cost model is enabled) or the
1312 option that aligns as many accesses as possible. */
1314 static struct data_reference *
1315 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1316 loop_vec_info loop_vinfo,
1317 unsigned int *npeel,
1318 stmt_vector_for_cost *body_cost_vec)
1320 struct _vect_peel_extended_info res;
1322 res.peel_info.dr = NULL;
1323 res.body_cost_vec = stmt_vector_for_cost ();
1325 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1327 res.inside_cost = INT_MAX;
1328 res.outside_cost = INT_MAX;
1329 peeling_htab->traverse <_vect_peel_extended_info *,
1330 vect_peeling_hash_get_lowest_cost> (&res);
1332 else
1334 res.peel_info.count = 0;
1335 peeling_htab->traverse <_vect_peel_extended_info *,
1336 vect_peeling_hash_get_most_frequent> (&res);
1339 *npeel = res.peel_info.npeel;
1340 *body_cost_vec = res.body_cost_vec;
1341 return res.peel_info.dr;
1345 /* Function vect_enhance_data_refs_alignment
1347 This pass will use loop versioning and loop peeling in order to enhance
1348 the alignment of data references in the loop.
1350 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1351 original loop is to be vectorized. Any other loops that are created by
1352 the transformations performed in this pass - are not supposed to be
1353 vectorized. This restriction will be relaxed.
1355 This pass will require a cost model to guide it whether to apply peeling
1356 or versioning or a combination of the two. For example, the scheme that
1357 intel uses when given a loop with several memory accesses, is as follows:
1358 choose one memory access ('p') which alignment you want to force by doing
1359 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1360 other accesses are not necessarily aligned, or (2) use loop versioning to
1361 generate one loop in which all accesses are aligned, and another loop in
1362 which only 'p' is necessarily aligned.
1364 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1365 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1366 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1368 Devising a cost model is the most critical aspect of this work. It will
1369 guide us on which access to peel for, whether to use loop versioning, how
1370 many versions to create, etc. The cost model will probably consist of
1371 generic considerations as well as target specific considerations (on
1372 powerpc for example, misaligned stores are more painful than misaligned
1373 loads).
1375 Here are the general steps involved in alignment enhancements:
1377 -- original loop, before alignment analysis:
1378 for (i=0; i<N; i++){
1379 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1380 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1383 -- After vect_compute_data_refs_alignment:
1384 for (i=0; i<N; i++){
1385 x = q[i]; # DR_MISALIGNMENT(q) = 3
1386 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1389 -- Possibility 1: we do loop versioning:
1390 if (p is aligned) {
1391 for (i=0; i<N; i++){ # loop 1A
1392 x = q[i]; # DR_MISALIGNMENT(q) = 3
1393 p[i] = y; # DR_MISALIGNMENT(p) = 0
1396 else {
1397 for (i=0; i<N; i++){ # loop 1B
1398 x = q[i]; # DR_MISALIGNMENT(q) = 3
1399 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1403 -- Possibility 2: we do loop peeling:
1404 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1405 x = q[i];
1406 p[i] = y;
1408 for (i = 3; i < N; i++){ # loop 2A
1409 x = q[i]; # DR_MISALIGNMENT(q) = 0
1410 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1413 -- Possibility 3: combination of loop peeling and versioning:
1414 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1415 x = q[i];
1416 p[i] = y;
1418 if (p is aligned) {
1419 for (i = 3; i<N; i++){ # loop 3A
1420 x = q[i]; # DR_MISALIGNMENT(q) = 0
1421 p[i] = y; # DR_MISALIGNMENT(p) = 0
1424 else {
1425 for (i = 3; i<N; i++){ # loop 3B
1426 x = q[i]; # DR_MISALIGNMENT(q) = 0
1427 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1431 These loops are later passed to loop_transform to be vectorized. The
1432 vectorizer will use the alignment information to guide the transformation
1433 (whether to generate regular loads/stores, or with special handling for
1434 misalignment). */
1436 bool
1437 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1439 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1440 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1441 enum dr_alignment_support supportable_dr_alignment;
1442 struct data_reference *dr0 = NULL, *first_store = NULL;
1443 struct data_reference *dr;
1444 unsigned int i, j;
1445 bool do_peeling = false;
1446 bool do_versioning = false;
1447 bool stat;
1448 gimple *stmt;
1449 stmt_vec_info stmt_info;
1450 unsigned int npeel = 0;
1451 bool all_misalignments_unknown = true;
1452 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1453 unsigned possible_npeel_number = 1;
1454 tree vectype;
1455 unsigned int nelements, mis, same_align_drs_max = 0;
1456 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1457 hash_table<peel_info_hasher> peeling_htab (1);
1459 if (dump_enabled_p ())
1460 dump_printf_loc (MSG_NOTE, vect_location,
1461 "=== vect_enhance_data_refs_alignment ===\n");
1463 /* Reset data so we can safely be called multiple times. */
1464 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1465 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1467 /* While cost model enhancements are expected in the future, the high level
1468 view of the code at this time is as follows:
1470 A) If there is a misaligned access then see if peeling to align
1471 this access can make all data references satisfy
1472 vect_supportable_dr_alignment. If so, update data structures
1473 as needed and return true.
1475 B) If peeling wasn't possible and there is a data reference with an
1476 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1477 then see if loop versioning checks can be used to make all data
1478 references satisfy vect_supportable_dr_alignment. If so, update
1479 data structures as needed and return true.
1481 C) If neither peeling nor versioning were successful then return false if
1482 any data reference does not satisfy vect_supportable_dr_alignment.
1484 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1486 Note, Possibility 3 above (which is peeling and versioning together) is not
1487 being done at this time. */
1489 /* (1) Peeling to force alignment. */
1491 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1492 Considerations:
1493 + How many accesses will become aligned due to the peeling
1494 - How many accesses will become unaligned due to the peeling,
1495 and the cost of misaligned accesses.
1496 - The cost of peeling (the extra runtime checks, the increase
1497 in code size). */
1499 FOR_EACH_VEC_ELT (datarefs, i, dr)
1501 stmt = DR_STMT (dr);
1502 stmt_info = vinfo_for_stmt (stmt);
1504 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1505 continue;
1507 /* For interleaving, only the alignment of the first access
1508 matters. */
1509 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1510 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1511 continue;
1513 /* For invariant accesses there is nothing to enhance. */
1514 if (integer_zerop (DR_STEP (dr)))
1515 continue;
1517 /* Strided accesses perform only component accesses, alignment is
1518 irrelevant for them. */
1519 if (STMT_VINFO_STRIDED_P (stmt_info)
1520 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1521 continue;
1523 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1524 do_peeling = vector_alignment_reachable_p (dr);
1525 if (do_peeling)
1527 if (known_alignment_for_access_p (dr))
1529 unsigned int npeel_tmp;
1530 bool negative = tree_int_cst_compare (DR_STEP (dr),
1531 size_zero_node) < 0;
1533 /* Save info about DR in the hash table. */
1534 vectype = STMT_VINFO_VECTYPE (stmt_info);
1535 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1536 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1537 TREE_TYPE (DR_REF (dr))));
1538 npeel_tmp = (negative
1539 ? (mis - nelements) : (nelements - mis))
1540 & (nelements - 1);
1542 /* For multiple types, it is possible that the bigger type access
1543 will have more than one peeling option. E.g., a loop with two
1544 types: one of size (vector size / 4), and the other one of
1545 size (vector size / 8). Vectorization factor will 8. If both
1546 access are misaligned by 3, the first one needs one scalar
1547 iteration to be aligned, and the second one needs 5. But the
1548 first one will be aligned also by peeling 5 scalar
1549 iterations, and in that case both accesses will be aligned.
1550 Hence, except for the immediate peeling amount, we also want
1551 to try to add full vector size, while we don't exceed
1552 vectorization factor.
1553 We do this automtically for cost model, since we calculate cost
1554 for every peeling option. */
1555 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1557 if (STMT_SLP_TYPE (stmt_info))
1558 possible_npeel_number
1559 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1560 else
1561 possible_npeel_number = vf / nelements;
1564 /* Handle the aligned case. We may decide to align some other
1565 access, making DR unaligned. */
1566 if (DR_MISALIGNMENT (dr) == 0)
1568 npeel_tmp = 0;
1569 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1570 possible_npeel_number++;
1573 for (j = 0; j < possible_npeel_number; j++)
1575 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1576 dr, npeel_tmp);
1577 npeel_tmp += nelements;
1580 all_misalignments_unknown = false;
1581 /* Data-ref that was chosen for the case that all the
1582 misalignments are unknown is not relevant anymore, since we
1583 have a data-ref with known alignment. */
1584 dr0 = NULL;
1586 else
1588 /* If we don't know any misalignment values, we prefer
1589 peeling for data-ref that has the maximum number of data-refs
1590 with the same alignment, unless the target prefers to align
1591 stores over load. */
1592 if (all_misalignments_unknown)
1594 unsigned same_align_drs
1595 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1596 if (!dr0
1597 || same_align_drs_max < same_align_drs)
1599 same_align_drs_max = same_align_drs;
1600 dr0 = dr;
1602 /* For data-refs with the same number of related
1603 accesses prefer the one where the misalign
1604 computation will be invariant in the outermost loop. */
1605 else if (same_align_drs_max == same_align_drs)
1607 struct loop *ivloop0, *ivloop;
1608 ivloop0 = outermost_invariant_loop_for_expr
1609 (loop, DR_BASE_ADDRESS (dr0));
1610 ivloop = outermost_invariant_loop_for_expr
1611 (loop, DR_BASE_ADDRESS (dr));
1612 if ((ivloop && !ivloop0)
1613 || (ivloop && ivloop0
1614 && flow_loop_nested_p (ivloop, ivloop0)))
1615 dr0 = dr;
1618 if (!first_store && DR_IS_WRITE (dr))
1619 first_store = dr;
1622 /* If there are both known and unknown misaligned accesses in the
1623 loop, we choose peeling amount according to the known
1624 accesses. */
1625 if (!supportable_dr_alignment)
1627 dr0 = dr;
1628 if (!first_store && DR_IS_WRITE (dr))
1629 first_store = dr;
1633 else
1635 if (!aligned_access_p (dr))
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "vector alignment may not be reachable\n");
1640 break;
1645 /* Check if we can possibly peel the loop. */
1646 if (!vect_can_advance_ivs_p (loop_vinfo)
1647 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1648 || loop->inner)
1649 do_peeling = false;
1651 if (do_peeling
1652 && all_misalignments_unknown
1653 && vect_supportable_dr_alignment (dr0, false))
1655 /* Check if the target requires to prefer stores over loads, i.e., if
1656 misaligned stores are more expensive than misaligned loads (taking
1657 drs with same alignment into account). */
1658 if (first_store && DR_IS_READ (dr0))
1660 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1661 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1662 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1663 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1664 stmt_vector_for_cost dummy;
1665 dummy.create (2);
1667 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1668 &dummy);
1669 vect_get_data_access_cost (first_store, &store_inside_cost,
1670 &store_outside_cost, &dummy);
1672 dummy.release ();
1674 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1675 aligning the load DR0). */
1676 load_inside_penalty = store_inside_cost;
1677 load_outside_penalty = store_outside_cost;
1678 for (i = 0;
1679 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1680 DR_STMT (first_store))).iterate (i, &dr);
1681 i++)
1682 if (DR_IS_READ (dr))
1684 load_inside_penalty += load_inside_cost;
1685 load_outside_penalty += load_outside_cost;
1687 else
1689 load_inside_penalty += store_inside_cost;
1690 load_outside_penalty += store_outside_cost;
1693 /* Calculate the penalty for leaving DR0 unaligned (by
1694 aligning the FIRST_STORE). */
1695 store_inside_penalty = load_inside_cost;
1696 store_outside_penalty = load_outside_cost;
1697 for (i = 0;
1698 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1699 DR_STMT (dr0))).iterate (i, &dr);
1700 i++)
1701 if (DR_IS_READ (dr))
1703 store_inside_penalty += load_inside_cost;
1704 store_outside_penalty += load_outside_cost;
1706 else
1708 store_inside_penalty += store_inside_cost;
1709 store_outside_penalty += store_outside_cost;
1712 if (load_inside_penalty > store_inside_penalty
1713 || (load_inside_penalty == store_inside_penalty
1714 && load_outside_penalty > store_outside_penalty))
1715 dr0 = first_store;
1718 /* In case there are only loads with different unknown misalignments, use
1719 peeling only if it may help to align other accesses in the loop or
1720 if it may help improving load bandwith when we'd end up using
1721 unaligned loads. */
1722 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1723 if (!first_store
1724 && !STMT_VINFO_SAME_ALIGN_REFS (
1725 vinfo_for_stmt (DR_STMT (dr0))).length ()
1726 && (vect_supportable_dr_alignment (dr0, false)
1727 != dr_unaligned_supported
1728 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1729 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1730 do_peeling = false;
1733 if (do_peeling && !dr0)
1735 /* Peeling is possible, but there is no data access that is not supported
1736 unless aligned. So we try to choose the best possible peeling. */
1738 /* We should get here only if there are drs with known misalignment. */
1739 gcc_assert (!all_misalignments_unknown);
1741 /* Choose the best peeling from the hash table. */
1742 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1743 loop_vinfo, &npeel,
1744 &body_cost_vec);
1745 if (!dr0 || !npeel)
1746 do_peeling = false;
1749 if (do_peeling)
1751 stmt = DR_STMT (dr0);
1752 stmt_info = vinfo_for_stmt (stmt);
1753 vectype = STMT_VINFO_VECTYPE (stmt_info);
1754 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1756 if (known_alignment_for_access_p (dr0))
1758 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1759 size_zero_node) < 0;
1760 if (!npeel)
1762 /* Since it's known at compile time, compute the number of
1763 iterations in the peeled loop (the peeling factor) for use in
1764 updating DR_MISALIGNMENT values. The peeling factor is the
1765 vectorization factor minus the misalignment as an element
1766 count. */
1767 mis = DR_MISALIGNMENT (dr0);
1768 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1769 npeel = ((negative ? mis - nelements : nelements - mis)
1770 & (nelements - 1));
1773 /* For interleaved data access every iteration accesses all the
1774 members of the group, therefore we divide the number of iterations
1775 by the group size. */
1776 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1777 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1778 npeel /= GROUP_SIZE (stmt_info);
1780 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_NOTE, vect_location,
1782 "Try peeling by %d\n", npeel);
1785 /* Ensure that all data refs can be vectorized after the peel. */
1786 FOR_EACH_VEC_ELT (datarefs, i, dr)
1788 int save_misalignment;
1790 if (dr == dr0)
1791 continue;
1793 stmt = DR_STMT (dr);
1794 stmt_info = vinfo_for_stmt (stmt);
1795 /* For interleaving, only the alignment of the first access
1796 matters. */
1797 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1798 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1799 continue;
1801 /* Strided accesses perform only component accesses, alignment is
1802 irrelevant for them. */
1803 if (STMT_VINFO_STRIDED_P (stmt_info)
1804 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1805 continue;
1807 save_misalignment = DR_MISALIGNMENT (dr);
1808 vect_update_misalignment_for_peel (dr, dr0, npeel);
1809 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1810 SET_DR_MISALIGNMENT (dr, save_misalignment);
1812 if (!supportable_dr_alignment)
1814 do_peeling = false;
1815 break;
1819 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1821 stat = vect_verify_datarefs_alignment (loop_vinfo);
1822 if (!stat)
1823 do_peeling = false;
1824 else
1826 body_cost_vec.release ();
1827 return stat;
1831 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1832 if (do_peeling)
1834 unsigned max_allowed_peel
1835 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1836 if (max_allowed_peel != (unsigned)-1)
1838 unsigned max_peel = npeel;
1839 if (max_peel == 0)
1841 gimple *dr_stmt = DR_STMT (dr0);
1842 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1843 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1844 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1846 if (max_peel > max_allowed_peel)
1848 do_peeling = false;
1849 if (dump_enabled_p ())
1850 dump_printf_loc (MSG_NOTE, vect_location,
1851 "Disable peeling, max peels reached: %d\n", max_peel);
1856 /* Cost model #2 - if peeling may result in a remaining loop not
1857 iterating enough to be vectorized then do not peel. */
1858 if (do_peeling
1859 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1861 unsigned max_peel
1862 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1863 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1864 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1865 do_peeling = false;
1868 if (do_peeling)
1870 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1871 If the misalignment of DR_i is identical to that of dr0 then set
1872 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1873 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1874 by the peeling factor times the element size of DR_i (MOD the
1875 vectorization factor times the size). Otherwise, the
1876 misalignment of DR_i must be set to unknown. */
1877 FOR_EACH_VEC_ELT (datarefs, i, dr)
1878 if (dr != dr0)
1880 /* Strided accesses perform only component accesses, alignment
1881 is irrelevant for them. */
1882 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1883 if (STMT_VINFO_STRIDED_P (stmt_info)
1884 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1885 continue;
1887 vect_update_misalignment_for_peel (dr, dr0, npeel);
1890 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1891 if (npeel)
1892 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1893 else
1894 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1895 = DR_MISALIGNMENT (dr0);
1896 SET_DR_MISALIGNMENT (dr0, 0);
1897 if (dump_enabled_p ())
1899 dump_printf_loc (MSG_NOTE, vect_location,
1900 "Alignment of access forced using peeling.\n");
1901 dump_printf_loc (MSG_NOTE, vect_location,
1902 "Peeling for alignment will be applied.\n");
1904 /* The inside-loop cost will be accounted for in vectorizable_load
1905 and vectorizable_store correctly with adjusted alignments.
1906 Drop the body_cst_vec on the floor here. */
1907 body_cost_vec.release ();
1909 stat = vect_verify_datarefs_alignment (loop_vinfo);
1910 gcc_assert (stat);
1911 return stat;
1915 body_cost_vec.release ();
1917 /* (2) Versioning to force alignment. */
1919 /* Try versioning if:
1920 1) optimize loop for speed
1921 2) there is at least one unsupported misaligned data ref with an unknown
1922 misalignment, and
1923 3) all misaligned data refs with a known misalignment are supported, and
1924 4) the number of runtime alignment checks is within reason. */
1926 do_versioning =
1927 optimize_loop_nest_for_speed_p (loop)
1928 && (!loop->inner); /* FORNOW */
1930 if (do_versioning)
1932 FOR_EACH_VEC_ELT (datarefs, i, dr)
1934 stmt = DR_STMT (dr);
1935 stmt_info = vinfo_for_stmt (stmt);
1937 /* For interleaving, only the alignment of the first access
1938 matters. */
1939 if (aligned_access_p (dr)
1940 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1941 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1942 continue;
1944 if (STMT_VINFO_STRIDED_P (stmt_info))
1946 /* Strided loads perform only component accesses, alignment is
1947 irrelevant for them. */
1948 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1949 continue;
1950 do_versioning = false;
1951 break;
1954 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1956 if (!supportable_dr_alignment)
1958 gimple *stmt;
1959 int mask;
1960 tree vectype;
1962 if (known_alignment_for_access_p (dr)
1963 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1964 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1966 do_versioning = false;
1967 break;
1970 stmt = DR_STMT (dr);
1971 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1972 gcc_assert (vectype);
1974 /* The rightmost bits of an aligned address must be zeros.
1975 Construct the mask needed for this test. For example,
1976 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1977 mask must be 15 = 0xf. */
1978 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1980 /* FORNOW: use the same mask to test all potentially unaligned
1981 references in the loop. The vectorizer currently supports
1982 a single vector size, see the reference to
1983 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1984 vectorization factor is computed. */
1985 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1986 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1987 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1988 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1989 DR_STMT (dr));
1993 /* Versioning requires at least one misaligned data reference. */
1994 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1995 do_versioning = false;
1996 else if (!do_versioning)
1997 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2000 if (do_versioning)
2002 vec<gimple *> may_misalign_stmts
2003 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2004 gimple *stmt;
2006 /* It can now be assumed that the data references in the statements
2007 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2008 of the loop being vectorized. */
2009 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2011 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2012 dr = STMT_VINFO_DATA_REF (stmt_info);
2013 SET_DR_MISALIGNMENT (dr, 0);
2014 if (dump_enabled_p ())
2015 dump_printf_loc (MSG_NOTE, vect_location,
2016 "Alignment of access forced using versioning.\n");
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_NOTE, vect_location,
2021 "Versioning for alignment will be applied.\n");
2023 /* Peeling and versioning can't be done together at this time. */
2024 gcc_assert (! (do_peeling && do_versioning));
2026 stat = vect_verify_datarefs_alignment (loop_vinfo);
2027 gcc_assert (stat);
2028 return stat;
2031 /* This point is reached if neither peeling nor versioning is being done. */
2032 gcc_assert (! (do_peeling || do_versioning));
2034 stat = vect_verify_datarefs_alignment (loop_vinfo);
2035 return stat;
2039 /* Function vect_find_same_alignment_drs.
2041 Update group and alignment relations according to the chosen
2042 vectorization factor. */
2044 static void
2045 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2046 loop_vec_info loop_vinfo)
2048 unsigned int i;
2049 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2050 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2051 struct data_reference *dra = DDR_A (ddr);
2052 struct data_reference *drb = DDR_B (ddr);
2053 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2054 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2055 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2056 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2057 lambda_vector dist_v;
2058 unsigned int loop_depth;
2060 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2061 return;
2063 if (dra == drb)
2064 return;
2066 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2067 return;
2069 /* Loop-based vectorization and known data dependence. */
2070 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2071 return;
2073 /* Data-dependence analysis reports a distance vector of zero
2074 for data-references that overlap only in the first iteration
2075 but have different sign step (see PR45764).
2076 So as a sanity check require equal DR_STEP. */
2077 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2078 return;
2080 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2081 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2083 int dist = dist_v[loop_depth];
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_NOTE, vect_location,
2087 "dependence distance = %d.\n", dist);
2089 /* Same loop iteration. */
2090 if (dist == 0
2091 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2093 /* Two references with distance zero have the same alignment. */
2094 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2095 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2096 if (dump_enabled_p ())
2098 dump_printf_loc (MSG_NOTE, vect_location,
2099 "accesses have the same alignment.\n");
2100 dump_printf (MSG_NOTE,
2101 "dependence distance modulo vf == 0 between ");
2102 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2103 dump_printf (MSG_NOTE, " and ");
2104 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2105 dump_printf (MSG_NOTE, "\n");
2112 /* Function vect_analyze_data_refs_alignment
2114 Analyze the alignment of the data-references in the loop.
2115 Return FALSE if a data reference is found that cannot be vectorized. */
2117 bool
2118 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2120 if (dump_enabled_p ())
2121 dump_printf_loc (MSG_NOTE, vect_location,
2122 "=== vect_analyze_data_refs_alignment ===\n");
2124 /* Mark groups of data references with same alignment using
2125 data dependence information. */
2126 vec<ddr_p> ddrs = vinfo->ddrs;
2127 struct data_dependence_relation *ddr;
2128 unsigned int i;
2130 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2131 vect_find_same_alignment_drs (ddr, vinfo);
2133 vec<data_reference_p> datarefs = vinfo->datarefs;
2134 struct data_reference *dr;
2136 FOR_EACH_VEC_ELT (datarefs, i, dr)
2138 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2139 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2140 && !vect_compute_data_ref_alignment (dr))
2142 /* Strided accesses perform only component accesses, misalignment
2143 information is irrelevant for them. */
2144 if (STMT_VINFO_STRIDED_P (stmt_info)
2145 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2146 continue;
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150 "not vectorized: can't calculate alignment "
2151 "for data ref.\n");
2153 return false;
2157 return true;
2161 /* Analyze alignment of DRs of stmts in NODE. */
2163 static bool
2164 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2166 /* We vectorize from the first scalar stmt in the node unless
2167 the node is permuted in which case we start from the first
2168 element in the group. */
2169 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2170 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2171 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2172 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2174 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2175 if (! vect_compute_data_ref_alignment (dr)
2176 /* For creating the data-ref pointer we need alignment of the
2177 first element anyway. */
2178 || (dr != first_dr
2179 && ! vect_compute_data_ref_alignment (first_dr))
2180 || ! verify_data_ref_alignment (dr))
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184 "not vectorized: bad data alignment in basic "
2185 "block.\n");
2186 return false;
2189 return true;
2192 /* Function vect_slp_analyze_instance_alignment
2194 Analyze the alignment of the data-references in the SLP instance.
2195 Return FALSE if a data reference is found that cannot be vectorized. */
2197 bool
2198 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2200 if (dump_enabled_p ())
2201 dump_printf_loc (MSG_NOTE, vect_location,
2202 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2204 slp_tree node;
2205 unsigned i;
2206 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2207 if (! vect_slp_analyze_and_verify_node_alignment (node))
2208 return false;
2210 node = SLP_INSTANCE_TREE (instance);
2211 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2212 && ! vect_slp_analyze_and_verify_node_alignment
2213 (SLP_INSTANCE_TREE (instance)))
2214 return false;
2216 return true;
2220 /* Analyze groups of accesses: check that DR belongs to a group of
2221 accesses of legal size, step, etc. Detect gaps, single element
2222 interleaving, and other special cases. Set grouped access info.
2223 Collect groups of strided stores for further use in SLP analysis.
2224 Worker for vect_analyze_group_access. */
2226 static bool
2227 vect_analyze_group_access_1 (struct data_reference *dr)
2229 tree step = DR_STEP (dr);
2230 tree scalar_type = TREE_TYPE (DR_REF (dr));
2231 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2232 gimple *stmt = DR_STMT (dr);
2233 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2234 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2235 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2236 HOST_WIDE_INT dr_step = -1;
2237 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2238 bool slp_impossible = false;
2240 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2241 size of the interleaving group (including gaps). */
2242 if (tree_fits_shwi_p (step))
2244 dr_step = tree_to_shwi (step);
2245 /* Check that STEP is a multiple of type size. Otherwise there is
2246 a non-element-sized gap at the end of the group which we
2247 cannot represent in GROUP_GAP or GROUP_SIZE.
2248 ??? As we can handle non-constant step fine here we should
2249 simply remove uses of GROUP_GAP between the last and first
2250 element and instead rely on DR_STEP. GROUP_SIZE then would
2251 simply not include that gap. */
2252 if ((dr_step % type_size) != 0)
2254 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_NOTE, vect_location,
2257 "Step ");
2258 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2259 dump_printf (MSG_NOTE,
2260 " is not a multiple of the element size for ");
2261 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2262 dump_printf (MSG_NOTE, "\n");
2264 return false;
2266 groupsize = absu_hwi (dr_step) / type_size;
2268 else
2269 groupsize = 0;
2271 /* Not consecutive access is possible only if it is a part of interleaving. */
2272 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2274 /* Check if it this DR is a part of interleaving, and is a single
2275 element of the group that is accessed in the loop. */
2277 /* Gaps are supported only for loads. STEP must be a multiple of the type
2278 size. The size of the group must be a power of 2. */
2279 if (DR_IS_READ (dr)
2280 && (dr_step % type_size) == 0
2281 && groupsize > 0
2282 && pow2p_hwi (groupsize))
2284 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2285 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2286 GROUP_GAP (stmt_info) = groupsize - 1;
2287 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_NOTE, vect_location,
2290 "Detected single element interleaving ");
2291 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2292 dump_printf (MSG_NOTE, " step ");
2293 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2294 dump_printf (MSG_NOTE, "\n");
2297 return true;
2300 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303 "not consecutive access ");
2304 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2307 if (bb_vinfo)
2309 /* Mark the statement as unvectorizable. */
2310 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2311 return true;
2314 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2315 STMT_VINFO_STRIDED_P (stmt_info) = true;
2316 return true;
2319 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2321 /* First stmt in the interleaving chain. Check the chain. */
2322 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2323 struct data_reference *data_ref = dr;
2324 unsigned int count = 1;
2325 tree prev_init = DR_INIT (data_ref);
2326 gimple *prev = stmt;
2327 HOST_WIDE_INT diff, gaps = 0;
2329 while (next)
2331 /* Skip same data-refs. In case that two or more stmts share
2332 data-ref (supported only for loads), we vectorize only the first
2333 stmt, and the rest get their vectorized loads from the first
2334 one. */
2335 if (!tree_int_cst_compare (DR_INIT (data_ref),
2336 DR_INIT (STMT_VINFO_DATA_REF (
2337 vinfo_for_stmt (next)))))
2339 if (DR_IS_WRITE (data_ref))
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "Two store stmts share the same dr.\n");
2344 return false;
2347 if (dump_enabled_p ())
2348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2349 "Two or more load stmts share the same dr.\n");
2351 /* For load use the same data-ref load. */
2352 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2354 prev = next;
2355 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2356 continue;
2359 prev = next;
2360 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2362 /* All group members have the same STEP by construction. */
2363 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2365 /* Check that the distance between two accesses is equal to the type
2366 size. Otherwise, we have gaps. */
2367 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2368 - TREE_INT_CST_LOW (prev_init)) / type_size;
2369 if (diff != 1)
2371 /* FORNOW: SLP of accesses with gaps is not supported. */
2372 slp_impossible = true;
2373 if (DR_IS_WRITE (data_ref))
2375 if (dump_enabled_p ())
2376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2377 "interleaved store with gaps\n");
2378 return false;
2381 gaps += diff - 1;
2384 last_accessed_element += diff;
2386 /* Store the gap from the previous member of the group. If there is no
2387 gap in the access, GROUP_GAP is always 1. */
2388 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2390 prev_init = DR_INIT (data_ref);
2391 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2392 /* Count the number of data-refs in the chain. */
2393 count++;
2396 if (groupsize == 0)
2397 groupsize = count + gaps;
2399 /* This could be UINT_MAX but as we are generating code in a very
2400 inefficient way we have to cap earlier. See PR78699 for example. */
2401 if (groupsize > 4096)
2403 if (dump_enabled_p ())
2404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2405 "group is too large\n");
2406 return false;
2409 /* Check that the size of the interleaving is equal to count for stores,
2410 i.e., that there are no gaps. */
2411 if (groupsize != count
2412 && !DR_IS_READ (dr))
2414 if (dump_enabled_p ())
2415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2416 "interleaved store with gaps\n");
2417 return false;
2420 /* If there is a gap after the last load in the group it is the
2421 difference between the groupsize and the last accessed
2422 element.
2423 When there is no gap, this difference should be 0. */
2424 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2426 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2427 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_NOTE, vect_location,
2430 "Detected interleaving ");
2431 if (DR_IS_READ (dr))
2432 dump_printf (MSG_NOTE, "load ");
2433 else
2434 dump_printf (MSG_NOTE, "store ");
2435 dump_printf (MSG_NOTE, "of size %u starting with ",
2436 (unsigned)groupsize);
2437 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2438 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2439 dump_printf_loc (MSG_NOTE, vect_location,
2440 "There is a gap of %u elements after the group\n",
2441 GROUP_GAP (vinfo_for_stmt (stmt)));
2444 /* SLP: create an SLP data structure for every interleaving group of
2445 stores for further analysis in vect_analyse_slp. */
2446 if (DR_IS_WRITE (dr) && !slp_impossible)
2448 if (loop_vinfo)
2449 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2450 if (bb_vinfo)
2451 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2455 return true;
2458 /* Analyze groups of accesses: check that DR belongs to a group of
2459 accesses of legal size, step, etc. Detect gaps, single element
2460 interleaving, and other special cases. Set grouped access info.
2461 Collect groups of strided stores for further use in SLP analysis. */
2463 static bool
2464 vect_analyze_group_access (struct data_reference *dr)
2466 if (!vect_analyze_group_access_1 (dr))
2468 /* Dissolve the group if present. */
2469 gimple *next;
2470 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2471 while (stmt)
2473 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2474 next = GROUP_NEXT_ELEMENT (vinfo);
2475 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2476 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2477 stmt = next;
2479 return false;
2481 return true;
2484 /* Analyze the access pattern of the data-reference DR.
2485 In case of non-consecutive accesses call vect_analyze_group_access() to
2486 analyze groups of accesses. */
2488 static bool
2489 vect_analyze_data_ref_access (struct data_reference *dr)
2491 tree step = DR_STEP (dr);
2492 tree scalar_type = TREE_TYPE (DR_REF (dr));
2493 gimple *stmt = DR_STMT (dr);
2494 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2495 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2496 struct loop *loop = NULL;
2498 if (loop_vinfo)
2499 loop = LOOP_VINFO_LOOP (loop_vinfo);
2501 if (loop_vinfo && !step)
2503 if (dump_enabled_p ())
2504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2505 "bad data-ref access in loop\n");
2506 return false;
2509 /* Allow loads with zero step in inner-loop vectorization. */
2510 if (loop_vinfo && integer_zerop (step))
2512 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2513 if (!nested_in_vect_loop_p (loop, stmt))
2514 return DR_IS_READ (dr);
2515 /* Allow references with zero step for outer loops marked
2516 with pragma omp simd only - it guarantees absence of
2517 loop-carried dependencies between inner loop iterations. */
2518 if (!loop->force_vectorize)
2520 if (dump_enabled_p ())
2521 dump_printf_loc (MSG_NOTE, vect_location,
2522 "zero step in inner loop of nest\n");
2523 return false;
2527 if (loop && nested_in_vect_loop_p (loop, stmt))
2529 /* Interleaved accesses are not yet supported within outer-loop
2530 vectorization for references in the inner-loop. */
2531 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2533 /* For the rest of the analysis we use the outer-loop step. */
2534 step = STMT_VINFO_DR_STEP (stmt_info);
2535 if (integer_zerop (step))
2537 if (dump_enabled_p ())
2538 dump_printf_loc (MSG_NOTE, vect_location,
2539 "zero step in outer loop.\n");
2540 return DR_IS_READ (dr);
2544 /* Consecutive? */
2545 if (TREE_CODE (step) == INTEGER_CST)
2547 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2548 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2549 || (dr_step < 0
2550 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2552 /* Mark that it is not interleaving. */
2553 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2554 return true;
2558 if (loop && nested_in_vect_loop_p (loop, stmt))
2560 if (dump_enabled_p ())
2561 dump_printf_loc (MSG_NOTE, vect_location,
2562 "grouped access in outer loop.\n");
2563 return false;
2567 /* Assume this is a DR handled by non-constant strided load case. */
2568 if (TREE_CODE (step) != INTEGER_CST)
2569 return (STMT_VINFO_STRIDED_P (stmt_info)
2570 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2571 || vect_analyze_group_access (dr)));
2573 /* Not consecutive access - check if it's a part of interleaving group. */
2574 return vect_analyze_group_access (dr);
2579 /* A helper function used in the comparator function to sort data
2580 references. T1 and T2 are two data references to be compared.
2581 The function returns -1, 0, or 1. */
2583 static int
2584 compare_tree (tree t1, tree t2)
2586 int i, cmp;
2587 enum tree_code code;
2588 char tclass;
2590 if (t1 == t2)
2591 return 0;
2592 if (t1 == NULL)
2593 return -1;
2594 if (t2 == NULL)
2595 return 1;
2597 STRIP_NOPS (t1);
2598 STRIP_NOPS (t2);
2600 if (TREE_CODE (t1) != TREE_CODE (t2))
2601 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2603 code = TREE_CODE (t1);
2604 switch (code)
2606 /* For const values, we can just use hash values for comparisons. */
2607 case INTEGER_CST:
2608 case REAL_CST:
2609 case FIXED_CST:
2610 case STRING_CST:
2611 case COMPLEX_CST:
2612 case VECTOR_CST:
2614 hashval_t h1 = iterative_hash_expr (t1, 0);
2615 hashval_t h2 = iterative_hash_expr (t2, 0);
2616 if (h1 != h2)
2617 return h1 < h2 ? -1 : 1;
2618 break;
2621 case SSA_NAME:
2622 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2623 if (cmp != 0)
2624 return cmp;
2626 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2627 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2628 break;
2630 default:
2631 tclass = TREE_CODE_CLASS (code);
2633 /* For var-decl, we could compare their UIDs. */
2634 if (tclass == tcc_declaration)
2636 if (DECL_UID (t1) != DECL_UID (t2))
2637 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2638 break;
2641 /* For expressions with operands, compare their operands recursively. */
2642 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2644 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2645 if (cmp != 0)
2646 return cmp;
2650 return 0;
2654 /* Compare two data-references DRA and DRB to group them into chunks
2655 suitable for grouping. */
2657 static int
2658 dr_group_sort_cmp (const void *dra_, const void *drb_)
2660 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2661 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2662 int cmp;
2664 /* Stabilize sort. */
2665 if (dra == drb)
2666 return 0;
2668 /* DRs in different loops never belong to the same group. */
2669 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2670 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2671 if (loopa != loopb)
2672 return loopa->num < loopb->num ? -1 : 1;
2674 /* Ordering of DRs according to base. */
2675 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2677 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2678 if (cmp != 0)
2679 return cmp;
2682 /* And according to DR_OFFSET. */
2683 if (!dr_equal_offsets_p (dra, drb))
2685 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2686 if (cmp != 0)
2687 return cmp;
2690 /* Put reads before writes. */
2691 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2692 return DR_IS_READ (dra) ? -1 : 1;
2694 /* Then sort after access size. */
2695 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2696 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2698 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2699 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2700 if (cmp != 0)
2701 return cmp;
2704 /* And after step. */
2705 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2707 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2708 if (cmp != 0)
2709 return cmp;
2712 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2713 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2714 if (cmp == 0)
2715 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2716 return cmp;
2719 /* Function vect_analyze_data_ref_accesses.
2721 Analyze the access pattern of all the data references in the loop.
2723 FORNOW: the only access pattern that is considered vectorizable is a
2724 simple step 1 (consecutive) access.
2726 FORNOW: handle only arrays and pointer accesses. */
2728 bool
2729 vect_analyze_data_ref_accesses (vec_info *vinfo)
2731 unsigned int i;
2732 vec<data_reference_p> datarefs = vinfo->datarefs;
2733 struct data_reference *dr;
2735 if (dump_enabled_p ())
2736 dump_printf_loc (MSG_NOTE, vect_location,
2737 "=== vect_analyze_data_ref_accesses ===\n");
2739 if (datarefs.is_empty ())
2740 return true;
2742 /* Sort the array of datarefs to make building the interleaving chains
2743 linear. Don't modify the original vector's order, it is needed for
2744 determining what dependencies are reversed. */
2745 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2746 datarefs_copy.qsort (dr_group_sort_cmp);
2748 /* Build the interleaving chains. */
2749 for (i = 0; i < datarefs_copy.length () - 1;)
2751 data_reference_p dra = datarefs_copy[i];
2752 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2753 stmt_vec_info lastinfo = NULL;
2754 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2756 ++i;
2757 continue;
2759 for (i = i + 1; i < datarefs_copy.length (); ++i)
2761 data_reference_p drb = datarefs_copy[i];
2762 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2763 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2764 break;
2766 /* ??? Imperfect sorting (non-compatible types, non-modulo
2767 accesses, same accesses) can lead to a group to be artificially
2768 split here as we don't just skip over those. If it really
2769 matters we can push those to a worklist and re-iterate
2770 over them. The we can just skip ahead to the next DR here. */
2772 /* DRs in a different loop should not be put into the same
2773 interleaving group. */
2774 if (gimple_bb (DR_STMT (dra))->loop_father
2775 != gimple_bb (DR_STMT (drb))->loop_father)
2776 break;
2778 /* Check that the data-refs have same first location (except init)
2779 and they are both either store or load (not load and store,
2780 not masked loads or stores). */
2781 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2782 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2783 DR_BASE_ADDRESS (drb), 0)
2784 || !dr_equal_offsets_p (dra, drb)
2785 || !gimple_assign_single_p (DR_STMT (dra))
2786 || !gimple_assign_single_p (DR_STMT (drb)))
2787 break;
2789 /* Check that the data-refs have the same constant size. */
2790 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2791 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2792 if (!tree_fits_uhwi_p (sza)
2793 || !tree_fits_uhwi_p (szb)
2794 || !tree_int_cst_equal (sza, szb))
2795 break;
2797 /* Check that the data-refs have the same step. */
2798 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2799 break;
2801 /* Do not place the same access in the interleaving chain twice. */
2802 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2803 break;
2805 /* Check the types are compatible.
2806 ??? We don't distinguish this during sorting. */
2807 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2808 TREE_TYPE (DR_REF (drb))))
2809 break;
2811 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2812 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2813 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2814 gcc_assert (init_a <= init_b);
2816 /* If init_b == init_a + the size of the type * k, we have an
2817 interleaving, and DRA is accessed before DRB. */
2818 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2819 if (type_size_a == 0
2820 || (init_b - init_a) % type_size_a != 0)
2821 break;
2823 /* If we have a store, the accesses are adjacent. This splits
2824 groups into chunks we support (we don't support vectorization
2825 of stores with gaps). */
2826 if (!DR_IS_READ (dra)
2827 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2828 (DR_INIT (datarefs_copy[i-1]))
2829 != type_size_a))
2830 break;
2832 /* If the step (if not zero or non-constant) is greater than the
2833 difference between data-refs' inits this splits groups into
2834 suitable sizes. */
2835 if (tree_fits_shwi_p (DR_STEP (dra)))
2837 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2838 if (step != 0 && step <= (init_b - init_a))
2839 break;
2842 if (dump_enabled_p ())
2844 dump_printf_loc (MSG_NOTE, vect_location,
2845 "Detected interleaving ");
2846 if (DR_IS_READ (dra))
2847 dump_printf (MSG_NOTE, "load ");
2848 else
2849 dump_printf (MSG_NOTE, "store ");
2850 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2851 dump_printf (MSG_NOTE, " and ");
2852 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2853 dump_printf (MSG_NOTE, "\n");
2856 /* Link the found element into the group list. */
2857 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2859 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2860 lastinfo = stmtinfo_a;
2862 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2863 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2864 lastinfo = stmtinfo_b;
2868 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2869 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2870 && !vect_analyze_data_ref_access (dr))
2872 if (dump_enabled_p ())
2873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2874 "not vectorized: complicated access pattern.\n");
2876 if (is_a <bb_vec_info> (vinfo))
2878 /* Mark the statement as not vectorizable. */
2879 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2880 continue;
2882 else
2884 datarefs_copy.release ();
2885 return false;
2889 datarefs_copy.release ();
2890 return true;
2894 /* Operator == between two dr_with_seg_len objects.
2896 This equality operator is used to make sure two data refs
2897 are the same one so that we will consider to combine the
2898 aliasing checks of those two pairs of data dependent data
2899 refs. */
2901 static bool
2902 operator == (const dr_with_seg_len& d1,
2903 const dr_with_seg_len& d2)
2905 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2906 DR_BASE_ADDRESS (d2.dr), 0)
2907 && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0
2908 && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0
2909 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2912 /* Function comp_dr_with_seg_len_pair.
2914 Comparison function for sorting objects of dr_with_seg_len_pair_t
2915 so that we can combine aliasing checks in one scan. */
2917 static int
2918 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_)
2920 const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_;
2921 const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_;
2922 const dr_with_seg_len &a1 = pa->first, &a2 = pa->second;
2923 const dr_with_seg_len &b1 = pb->first, &b2 = pb->second;
2925 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2926 if a and c have the same basic address snd step, and b and d have the same
2927 address and step. Therefore, if any a&c or b&d don't have the same address
2928 and step, we don't care the order of those two pairs after sorting. */
2929 int comp_res;
2931 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr),
2932 DR_BASE_ADDRESS (b1.dr))) != 0)
2933 return comp_res;
2934 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr),
2935 DR_BASE_ADDRESS (b2.dr))) != 0)
2936 return comp_res;
2937 if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0)
2938 return comp_res;
2939 if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0)
2940 return comp_res;
2941 if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0)
2942 return comp_res;
2943 if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0)
2944 return comp_res;
2945 if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0)
2946 return comp_res;
2947 if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0)
2948 return comp_res;
2950 return 0;
2953 /* Function vect_vfa_segment_size.
2955 Create an expression that computes the size of segment
2956 that will be accessed for a data reference. The functions takes into
2957 account that realignment loads may access one more vector.
2959 Input:
2960 DR: The data reference.
2961 LENGTH_FACTOR: segment length to consider.
2963 Return an expression whose value is the size of segment which will be
2964 accessed by DR. */
2966 static tree
2967 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2969 tree segment_length;
2971 if (integer_zerop (DR_STEP (dr)))
2972 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2973 else
2974 segment_length = size_binop (MULT_EXPR,
2975 fold_convert (sizetype, DR_STEP (dr)),
2976 fold_convert (sizetype, length_factor));
2978 if (vect_supportable_dr_alignment (dr, false)
2979 == dr_explicit_realign_optimized)
2981 tree vector_size = TYPE_SIZE_UNIT
2982 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2984 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2986 return segment_length;
2989 /* Function vect_no_alias_p.
2991 Given data references A and B with equal base and offset, the alias
2992 relation can be decided at compilation time, return TRUE if they do
2993 not alias to each other; return FALSE otherwise. SEGMENT_LENGTH_A
2994 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2995 respectively. */
2997 static bool
2998 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2999 tree segment_length_a, tree segment_length_b)
3001 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
3002 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
3003 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
3004 return false;
3006 tree seg_a_min = DR_INIT (a);
3007 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
3008 seg_a_min, segment_length_a);
3009 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3010 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3011 [a, a+12) */
3012 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3014 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
3015 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
3016 seg_a_max, unit_size);
3017 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3018 DR_INIT (a), unit_size);
3020 tree seg_b_min = DR_INIT (b);
3021 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3022 seg_b_min, segment_length_b);
3023 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3025 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3026 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3027 seg_b_max, unit_size);
3028 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3029 DR_INIT (b), unit_size);
3032 if (tree_int_cst_le (seg_a_max, seg_b_min)
3033 || tree_int_cst_le (seg_b_max, seg_a_min))
3034 return true;
3036 return false;
3039 /* Function vect_prune_runtime_alias_test_list.
3041 Prune a list of ddrs to be tested at run-time by versioning for alias.
3042 Merge several alias checks into one if possible.
3043 Return FALSE if resulting list of ddrs is longer then allowed by
3044 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3046 bool
3047 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3049 vec<ddr_p> may_alias_ddrs =
3050 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3051 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
3052 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3053 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3054 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3056 ddr_p ddr;
3057 unsigned int i;
3058 tree length_factor;
3060 if (dump_enabled_p ())
3061 dump_printf_loc (MSG_NOTE, vect_location,
3062 "=== vect_prune_runtime_alias_test_list ===\n");
3064 if (may_alias_ddrs.is_empty ())
3065 return true;
3067 /* Basically, for each pair of dependent data refs store_ptr_0
3068 and load_ptr_0, we create an expression:
3070 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3071 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
3073 for aliasing checks. However, in some cases we can decrease
3074 the number of checks by combining two checks into one. For
3075 example, suppose we have another pair of data refs store_ptr_0
3076 and load_ptr_1, and if the following condition is satisfied:
3078 load_ptr_0 < load_ptr_1 &&
3079 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3081 (this condition means, in each iteration of vectorized loop,
3082 the accessed memory of store_ptr_0 cannot be between the memory
3083 of load_ptr_0 and load_ptr_1.)
3085 we then can use only the following expression to finish the
3086 alising checks between store_ptr_0 & load_ptr_0 and
3087 store_ptr_0 & load_ptr_1:
3089 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3090 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
3092 Note that we only consider that load_ptr_0 and load_ptr_1 have the
3093 same basic address. */
3095 comp_alias_ddrs.create (may_alias_ddrs.length ());
3097 /* First, we collect all data ref pairs for aliasing checks. */
3098 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3100 int comp_res;
3101 struct data_reference *dr_a, *dr_b;
3102 gimple *dr_group_first_a, *dr_group_first_b;
3103 tree segment_length_a, segment_length_b;
3104 gimple *stmt_a, *stmt_b;
3106 dr_a = DDR_A (ddr);
3107 stmt_a = DR_STMT (DDR_A (ddr));
3108 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3109 if (dr_group_first_a)
3111 stmt_a = dr_group_first_a;
3112 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3115 dr_b = DDR_B (ddr);
3116 stmt_b = DR_STMT (DDR_B (ddr));
3117 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3118 if (dr_group_first_b)
3120 stmt_b = dr_group_first_b;
3121 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3124 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3125 length_factor = scalar_loop_iters;
3126 else
3127 length_factor = size_int (vect_factor);
3128 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3129 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3131 comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b));
3132 if (comp_res == 0)
3133 comp_res = compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
3135 /* Alias is known at compilation time. */
3136 if (comp_res == 0
3137 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3138 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3139 && TREE_CODE (segment_length_a) == INTEGER_CST
3140 && TREE_CODE (segment_length_b) == INTEGER_CST)
3142 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3143 continue;
3145 if (dump_enabled_p ())
3146 dump_printf_loc (MSG_NOTE, vect_location,
3147 "not vectorized: compilation time alias.\n");
3149 return false;
3152 dr_with_seg_len_pair_t dr_with_seg_len_pair
3153 (dr_with_seg_len (dr_a, segment_length_a),
3154 dr_with_seg_len (dr_b, segment_length_b));
3156 /* Canonicalize pairs by sorting the two DR members. */
3157 if (comp_res > 0)
3158 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3160 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3163 /* Second, we sort the collected data ref pairs so that we can scan
3164 them once to combine all possible aliasing checks. */
3165 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3167 /* Third, we scan the sorted dr pairs and check if we can combine
3168 alias checks of two neighboring dr pairs. */
3169 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3171 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3172 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3173 *dr_b1 = &comp_alias_ddrs[i-1].second,
3174 *dr_a2 = &comp_alias_ddrs[i].first,
3175 *dr_b2 = &comp_alias_ddrs[i].second;
3177 /* Remove duplicate data ref pairs. */
3178 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3180 if (dump_enabled_p ())
3182 dump_printf_loc (MSG_NOTE, vect_location,
3183 "found equal ranges ");
3184 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3185 DR_REF (dr_a1->dr));
3186 dump_printf (MSG_NOTE, ", ");
3187 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3188 DR_REF (dr_b1->dr));
3189 dump_printf (MSG_NOTE, " and ");
3190 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3191 DR_REF (dr_a2->dr));
3192 dump_printf (MSG_NOTE, ", ");
3193 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3194 DR_REF (dr_b2->dr));
3195 dump_printf (MSG_NOTE, "\n");
3198 comp_alias_ddrs.ordered_remove (i--);
3199 continue;
3202 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3204 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
3205 and DR_A1 and DR_A2 are two consecutive memrefs. */
3206 if (*dr_a1 == *dr_a2)
3208 std::swap (dr_a1, dr_b1);
3209 std::swap (dr_a2, dr_b2);
3212 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3213 DR_BASE_ADDRESS (dr_a2->dr), 0)
3214 || !operand_equal_p (DR_OFFSET (dr_a1->dr),
3215 DR_OFFSET (dr_a2->dr), 0)
3216 || !tree_fits_shwi_p (DR_INIT (dr_a1->dr))
3217 || !tree_fits_shwi_p (DR_INIT (dr_a2->dr)))
3218 continue;
3220 /* Make sure dr_a1 starts left of dr_a2. */
3221 if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr)))
3222 std::swap (*dr_a1, *dr_a2);
3224 bool do_remove = false;
3225 unsigned HOST_WIDE_INT diff
3226 = (tree_to_shwi (DR_INIT (dr_a2->dr))
3227 - tree_to_shwi (DR_INIT (dr_a1->dr)));
3229 /* If the left segment does not extend beyond the start of the
3230 right segment the new segment length is that of the right
3231 plus the segment distance. */
3232 if (tree_fits_uhwi_p (dr_a1->seg_len)
3233 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3235 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3236 size_int (diff));
3237 do_remove = true;
3239 /* Generally the new segment length is the maximum of the
3240 left segment size and the right segment size plus the distance.
3241 ??? We can also build tree MAX_EXPR here but it's not clear this
3242 is profitable. */
3243 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3244 && tree_fits_uhwi_p (dr_a2->seg_len))
3246 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3247 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3248 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3249 do_remove = true;
3251 /* Now we check if the following condition is satisfied:
3253 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3255 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3256 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3257 have to make a best estimation. We can get the minimum value
3258 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3259 then either of the following two conditions can guarantee the
3260 one above:
3262 1: DIFF <= MIN_SEG_LEN_B
3263 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
3264 else
3266 unsigned HOST_WIDE_INT min_seg_len_b
3267 = (tree_fits_uhwi_p (dr_b1->seg_len)
3268 ? tree_to_uhwi (dr_b1->seg_len)
3269 : vect_factor);
3271 if (diff <= min_seg_len_b
3272 || (tree_fits_uhwi_p (dr_a1->seg_len)
3273 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3275 dr_a1->seg_len = size_binop (PLUS_EXPR,
3276 dr_a2->seg_len, size_int (diff));
3277 do_remove = true;
3281 if (do_remove)
3283 if (dump_enabled_p ())
3285 dump_printf_loc (MSG_NOTE, vect_location,
3286 "merging ranges for ");
3287 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3288 dump_printf (MSG_NOTE, ", ");
3289 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3290 dump_printf (MSG_NOTE, " and ");
3291 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3292 dump_printf (MSG_NOTE, ", ");
3293 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3294 dump_printf (MSG_NOTE, "\n");
3296 comp_alias_ddrs.ordered_remove (i--);
3301 dump_printf_loc (MSG_NOTE, vect_location,
3302 "improved number of alias checks from %d to %d\n",
3303 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3304 if ((int) comp_alias_ddrs.length () >
3305 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3307 if (dump_enabled_p ())
3308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3309 "number of versioning for alias "
3310 "run-time tests exceeds %d "
3311 "(--param vect-max-version-for-alias-checks)\n",
3312 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3313 return false;
3316 /* All alias checks have been resolved at compilation time. */
3317 if (!comp_alias_ddrs.length ())
3318 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3320 return true;
3323 /* Return true if a non-affine read or write in STMT is suitable for a
3324 gather load or scatter store. Describe the operation in *INFO if so. */
3326 bool
3327 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3328 gather_scatter_info *info)
3330 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3331 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3332 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3333 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3334 tree offtype = NULL_TREE;
3335 tree decl, base, off;
3336 machine_mode pmode;
3337 int punsignedp, reversep, pvolatilep = 0;
3339 base = DR_REF (dr);
3340 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3341 see if we can use the def stmt of the address. */
3342 if (is_gimple_call (stmt)
3343 && gimple_call_internal_p (stmt)
3344 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3345 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3346 && TREE_CODE (base) == MEM_REF
3347 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3348 && integer_zerop (TREE_OPERAND (base, 1))
3349 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3351 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3352 if (is_gimple_assign (def_stmt)
3353 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3354 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3357 /* The gather and scatter builtins need address of the form
3358 loop_invariant + vector * {1, 2, 4, 8}
3360 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3361 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3362 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3363 multiplications and additions in it. To get a vector, we need
3364 a single SSA_NAME that will be defined in the loop and will
3365 contain everything that is not loop invariant and that can be
3366 vectorized. The following code attempts to find such a preexistng
3367 SSA_NAME OFF and put the loop invariants into a tree BASE
3368 that can be gimplified before the loop. */
3369 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3370 &punsignedp, &reversep, &pvolatilep);
3371 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3373 if (TREE_CODE (base) == MEM_REF)
3375 if (!integer_zerop (TREE_OPERAND (base, 1)))
3377 if (off == NULL_TREE)
3379 offset_int moff = mem_ref_offset (base);
3380 off = wide_int_to_tree (sizetype, moff);
3382 else
3383 off = size_binop (PLUS_EXPR, off,
3384 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3386 base = TREE_OPERAND (base, 0);
3388 else
3389 base = build_fold_addr_expr (base);
3391 if (off == NULL_TREE)
3392 off = size_zero_node;
3394 /* If base is not loop invariant, either off is 0, then we start with just
3395 the constant offset in the loop invariant BASE and continue with base
3396 as OFF, otherwise give up.
3397 We could handle that case by gimplifying the addition of base + off
3398 into some SSA_NAME and use that as off, but for now punt. */
3399 if (!expr_invariant_in_loop_p (loop, base))
3401 if (!integer_zerop (off))
3402 return false;
3403 off = base;
3404 base = size_int (pbitpos / BITS_PER_UNIT);
3406 /* Otherwise put base + constant offset into the loop invariant BASE
3407 and continue with OFF. */
3408 else
3410 base = fold_convert (sizetype, base);
3411 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3414 /* OFF at this point may be either a SSA_NAME or some tree expression
3415 from get_inner_reference. Try to peel off loop invariants from it
3416 into BASE as long as possible. */
3417 STRIP_NOPS (off);
3418 while (offtype == NULL_TREE)
3420 enum tree_code code;
3421 tree op0, op1, add = NULL_TREE;
3423 if (TREE_CODE (off) == SSA_NAME)
3425 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3427 if (expr_invariant_in_loop_p (loop, off))
3428 return false;
3430 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3431 break;
3433 op0 = gimple_assign_rhs1 (def_stmt);
3434 code = gimple_assign_rhs_code (def_stmt);
3435 op1 = gimple_assign_rhs2 (def_stmt);
3437 else
3439 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3440 return false;
3441 code = TREE_CODE (off);
3442 extract_ops_from_tree (off, &code, &op0, &op1);
3444 switch (code)
3446 case POINTER_PLUS_EXPR:
3447 case PLUS_EXPR:
3448 if (expr_invariant_in_loop_p (loop, op0))
3450 add = op0;
3451 off = op1;
3452 do_add:
3453 add = fold_convert (sizetype, add);
3454 if (scale != 1)
3455 add = size_binop (MULT_EXPR, add, size_int (scale));
3456 base = size_binop (PLUS_EXPR, base, add);
3457 continue;
3459 if (expr_invariant_in_loop_p (loop, op1))
3461 add = op1;
3462 off = op0;
3463 goto do_add;
3465 break;
3466 case MINUS_EXPR:
3467 if (expr_invariant_in_loop_p (loop, op1))
3469 add = fold_convert (sizetype, op1);
3470 add = size_binop (MINUS_EXPR, size_zero_node, add);
3471 off = op0;
3472 goto do_add;
3474 break;
3475 case MULT_EXPR:
3476 if (scale == 1 && tree_fits_shwi_p (op1))
3478 scale = tree_to_shwi (op1);
3479 off = op0;
3480 continue;
3482 break;
3483 case SSA_NAME:
3484 off = op0;
3485 continue;
3486 CASE_CONVERT:
3487 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3488 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3489 break;
3490 if (TYPE_PRECISION (TREE_TYPE (op0))
3491 == TYPE_PRECISION (TREE_TYPE (off)))
3493 off = op0;
3494 continue;
3496 if (TYPE_PRECISION (TREE_TYPE (op0))
3497 < TYPE_PRECISION (TREE_TYPE (off)))
3499 off = op0;
3500 offtype = TREE_TYPE (off);
3501 STRIP_NOPS (off);
3502 continue;
3504 break;
3505 default:
3506 break;
3508 break;
3511 /* If at the end OFF still isn't a SSA_NAME or isn't
3512 defined in the loop, punt. */
3513 if (TREE_CODE (off) != SSA_NAME
3514 || expr_invariant_in_loop_p (loop, off))
3515 return false;
3517 if (offtype == NULL_TREE)
3518 offtype = TREE_TYPE (off);
3520 if (DR_IS_READ (dr))
3521 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3522 offtype, scale);
3523 else
3524 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3525 offtype, scale);
3527 if (decl == NULL_TREE)
3528 return false;
3530 info->decl = decl;
3531 info->base = base;
3532 info->offset = off;
3533 info->offset_dt = vect_unknown_def_type;
3534 info->offset_vectype = NULL_TREE;
3535 info->scale = scale;
3536 return true;
3539 /* Function vect_analyze_data_refs.
3541 Find all the data references in the loop or basic block.
3543 The general structure of the analysis of data refs in the vectorizer is as
3544 follows:
3545 1- vect_analyze_data_refs(loop/bb): call
3546 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3547 in the loop/bb and their dependences.
3548 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3549 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3550 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3554 bool
3555 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3557 struct loop *loop = NULL;
3558 unsigned int i;
3559 struct data_reference *dr;
3560 tree scalar_type;
3562 if (dump_enabled_p ())
3563 dump_printf_loc (MSG_NOTE, vect_location,
3564 "=== vect_analyze_data_refs ===\n");
3566 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3567 loop = LOOP_VINFO_LOOP (loop_vinfo);
3569 /* Go through the data-refs, check that the analysis succeeded. Update
3570 pointer from stmt_vec_info struct to DR and vectype. */
3572 vec<data_reference_p> datarefs = vinfo->datarefs;
3573 FOR_EACH_VEC_ELT (datarefs, i, dr)
3575 gimple *stmt;
3576 stmt_vec_info stmt_info;
3577 tree base, offset, init;
3578 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3579 bool simd_lane_access = false;
3580 int vf;
3582 again:
3583 if (!dr || !DR_REF (dr))
3585 if (dump_enabled_p ())
3586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3587 "not vectorized: unhandled data-ref\n");
3588 return false;
3591 stmt = DR_STMT (dr);
3592 stmt_info = vinfo_for_stmt (stmt);
3594 /* Discard clobbers from the dataref vector. We will remove
3595 clobber stmts during vectorization. */
3596 if (gimple_clobber_p (stmt))
3598 free_data_ref (dr);
3599 if (i == datarefs.length () - 1)
3601 datarefs.pop ();
3602 break;
3604 datarefs.ordered_remove (i);
3605 dr = datarefs[i];
3606 goto again;
3609 /* Check that analysis of the data-ref succeeded. */
3610 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3611 || !DR_STEP (dr))
3613 bool maybe_gather
3614 = DR_IS_READ (dr)
3615 && !TREE_THIS_VOLATILE (DR_REF (dr))
3616 && targetm.vectorize.builtin_gather != NULL;
3617 bool maybe_scatter
3618 = DR_IS_WRITE (dr)
3619 && !TREE_THIS_VOLATILE (DR_REF (dr))
3620 && targetm.vectorize.builtin_scatter != NULL;
3621 bool maybe_simd_lane_access
3622 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3624 /* If target supports vector gather loads or scatter stores, or if
3625 this might be a SIMD lane access, see if they can't be used. */
3626 if (is_a <loop_vec_info> (vinfo)
3627 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3628 && !nested_in_vect_loop_p (loop, stmt))
3630 struct data_reference *newdr
3631 = create_data_ref (NULL, loop_containing_stmt (stmt),
3632 DR_REF (dr), stmt, maybe_scatter ? false : true);
3633 gcc_assert (newdr != NULL && DR_REF (newdr));
3634 if (DR_BASE_ADDRESS (newdr)
3635 && DR_OFFSET (newdr)
3636 && DR_INIT (newdr)
3637 && DR_STEP (newdr)
3638 && integer_zerop (DR_STEP (newdr)))
3640 if (maybe_simd_lane_access)
3642 tree off = DR_OFFSET (newdr);
3643 STRIP_NOPS (off);
3644 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3645 && TREE_CODE (off) == MULT_EXPR
3646 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3648 tree step = TREE_OPERAND (off, 1);
3649 off = TREE_OPERAND (off, 0);
3650 STRIP_NOPS (off);
3651 if (CONVERT_EXPR_P (off)
3652 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3653 0)))
3654 < TYPE_PRECISION (TREE_TYPE (off)))
3655 off = TREE_OPERAND (off, 0);
3656 if (TREE_CODE (off) == SSA_NAME)
3658 gimple *def = SSA_NAME_DEF_STMT (off);
3659 tree reft = TREE_TYPE (DR_REF (newdr));
3660 if (is_gimple_call (def)
3661 && gimple_call_internal_p (def)
3662 && (gimple_call_internal_fn (def)
3663 == IFN_GOMP_SIMD_LANE))
3665 tree arg = gimple_call_arg (def, 0);
3666 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3667 arg = SSA_NAME_VAR (arg);
3668 if (arg == loop->simduid
3669 /* For now. */
3670 && tree_int_cst_equal
3671 (TYPE_SIZE_UNIT (reft),
3672 step))
3674 DR_OFFSET (newdr) = ssize_int (0);
3675 DR_STEP (newdr) = step;
3676 DR_ALIGNED_TO (newdr)
3677 = size_int (BIGGEST_ALIGNMENT);
3678 dr = newdr;
3679 simd_lane_access = true;
3685 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3687 dr = newdr;
3688 if (maybe_gather)
3689 gatherscatter = GATHER;
3690 else
3691 gatherscatter = SCATTER;
3694 if (gatherscatter == SG_NONE && !simd_lane_access)
3695 free_data_ref (newdr);
3698 if (gatherscatter == SG_NONE && !simd_lane_access)
3700 if (dump_enabled_p ())
3702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3703 "not vectorized: data ref analysis "
3704 "failed ");
3705 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3708 if (is_a <bb_vec_info> (vinfo))
3709 break;
3711 return false;
3715 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3717 if (dump_enabled_p ())
3718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3719 "not vectorized: base addr of dr is a "
3720 "constant\n");
3722 if (is_a <bb_vec_info> (vinfo))
3723 break;
3725 if (gatherscatter != SG_NONE || simd_lane_access)
3726 free_data_ref (dr);
3727 return false;
3730 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3732 if (dump_enabled_p ())
3734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3735 "not vectorized: volatile type ");
3736 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3739 if (is_a <bb_vec_info> (vinfo))
3740 break;
3742 return false;
3745 if (stmt_can_throw_internal (stmt))
3747 if (dump_enabled_p ())
3749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3750 "not vectorized: statement can throw an "
3751 "exception ");
3752 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3755 if (is_a <bb_vec_info> (vinfo))
3756 break;
3758 if (gatherscatter != SG_NONE || simd_lane_access)
3759 free_data_ref (dr);
3760 return false;
3763 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3764 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3766 if (dump_enabled_p ())
3768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3769 "not vectorized: statement is bitfield "
3770 "access ");
3771 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3774 if (is_a <bb_vec_info> (vinfo))
3775 break;
3777 if (gatherscatter != SG_NONE || simd_lane_access)
3778 free_data_ref (dr);
3779 return false;
3782 base = unshare_expr (DR_BASE_ADDRESS (dr));
3783 offset = unshare_expr (DR_OFFSET (dr));
3784 init = unshare_expr (DR_INIT (dr));
3786 if (is_gimple_call (stmt)
3787 && (!gimple_call_internal_p (stmt)
3788 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3789 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3791 if (dump_enabled_p ())
3793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3794 "not vectorized: dr in a call ");
3795 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3798 if (is_a <bb_vec_info> (vinfo))
3799 break;
3801 if (gatherscatter != SG_NONE || simd_lane_access)
3802 free_data_ref (dr);
3803 return false;
3806 /* Update DR field in stmt_vec_info struct. */
3808 /* If the dataref is in an inner-loop of the loop that is considered for
3809 for vectorization, we also want to analyze the access relative to
3810 the outer-loop (DR contains information only relative to the
3811 inner-most enclosing loop). We do that by building a reference to the
3812 first location accessed by the inner-loop, and analyze it relative to
3813 the outer-loop. */
3814 if (loop && nested_in_vect_loop_p (loop, stmt))
3816 tree outer_step, outer_base, outer_init;
3817 HOST_WIDE_INT pbitsize, pbitpos;
3818 tree poffset;
3819 machine_mode pmode;
3820 int punsignedp, preversep, pvolatilep;
3821 affine_iv base_iv, offset_iv;
3822 tree dinit;
3824 /* Build a reference to the first location accessed by the
3825 inner-loop: *(BASE+INIT). (The first location is actually
3826 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3827 tree inner_base = build_fold_indirect_ref
3828 (fold_build_pointer_plus (base, init));
3830 if (dump_enabled_p ())
3832 dump_printf_loc (MSG_NOTE, vect_location,
3833 "analyze in outer-loop: ");
3834 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3835 dump_printf (MSG_NOTE, "\n");
3838 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3839 &poffset, &pmode, &punsignedp,
3840 &preversep, &pvolatilep);
3841 gcc_assert (outer_base != NULL_TREE);
3843 if (pbitpos % BITS_PER_UNIT != 0)
3845 if (dump_enabled_p ())
3846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3847 "failed: bit offset alignment.\n");
3848 return false;
3851 if (preversep)
3853 if (dump_enabled_p ())
3854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3855 "failed: reverse storage order.\n");
3856 return false;
3859 outer_base = build_fold_addr_expr (outer_base);
3860 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3861 &base_iv, false))
3863 if (dump_enabled_p ())
3864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3865 "failed: evolution of base is not affine.\n");
3866 return false;
3869 if (offset)
3871 if (poffset)
3872 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3873 poffset);
3874 else
3875 poffset = offset;
3878 if (!poffset)
3880 offset_iv.base = ssize_int (0);
3881 offset_iv.step = ssize_int (0);
3883 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3884 &offset_iv, false))
3886 if (dump_enabled_p ())
3887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3888 "evolution of offset is not affine.\n");
3889 return false;
3892 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3893 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3894 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3895 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3896 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3898 outer_step = size_binop (PLUS_EXPR,
3899 fold_convert (ssizetype, base_iv.step),
3900 fold_convert (ssizetype, offset_iv.step));
3902 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3903 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3904 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3905 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3906 STMT_VINFO_DR_OFFSET (stmt_info) =
3907 fold_convert (ssizetype, offset_iv.base);
3908 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3909 size_int (highest_pow2_factor (offset_iv.base));
3911 if (dump_enabled_p ())
3913 dump_printf_loc (MSG_NOTE, vect_location,
3914 "\touter base_address: ");
3915 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3916 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3917 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3918 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3919 STMT_VINFO_DR_OFFSET (stmt_info));
3920 dump_printf (MSG_NOTE,
3921 "\n\touter constant offset from base address: ");
3922 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3923 STMT_VINFO_DR_INIT (stmt_info));
3924 dump_printf (MSG_NOTE, "\n\touter step: ");
3925 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3926 STMT_VINFO_DR_STEP (stmt_info));
3927 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3928 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3929 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3930 dump_printf (MSG_NOTE, "\n");
3934 if (STMT_VINFO_DATA_REF (stmt_info))
3936 if (dump_enabled_p ())
3938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3939 "not vectorized: more than one data ref "
3940 "in stmt: ");
3941 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3944 if (is_a <bb_vec_info> (vinfo))
3945 break;
3947 if (gatherscatter != SG_NONE || simd_lane_access)
3948 free_data_ref (dr);
3949 return false;
3952 STMT_VINFO_DATA_REF (stmt_info) = dr;
3953 if (simd_lane_access)
3955 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3956 free_data_ref (datarefs[i]);
3957 datarefs[i] = dr;
3960 /* Set vectype for STMT. */
3961 scalar_type = TREE_TYPE (DR_REF (dr));
3962 STMT_VINFO_VECTYPE (stmt_info)
3963 = get_vectype_for_scalar_type (scalar_type);
3964 if (!STMT_VINFO_VECTYPE (stmt_info))
3966 if (dump_enabled_p ())
3968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3969 "not vectorized: no vectype for stmt: ");
3970 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3971 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3972 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3973 scalar_type);
3974 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3977 if (is_a <bb_vec_info> (vinfo))
3979 /* No vector type is fine, the ref can still participate
3980 in dependence analysis, we just can't vectorize it. */
3981 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3982 continue;
3985 if (gatherscatter != SG_NONE || simd_lane_access)
3987 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3988 if (gatherscatter != SG_NONE)
3989 free_data_ref (dr);
3991 return false;
3993 else
3995 if (dump_enabled_p ())
3997 dump_printf_loc (MSG_NOTE, vect_location,
3998 "got vectype for stmt: ");
3999 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4000 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4001 STMT_VINFO_VECTYPE (stmt_info));
4002 dump_printf (MSG_NOTE, "\n");
4006 /* Adjust the minimal vectorization factor according to the
4007 vector type. */
4008 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4009 if (vf > *min_vf)
4010 *min_vf = vf;
4012 if (gatherscatter != SG_NONE)
4014 gather_scatter_info gs_info;
4015 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4016 &gs_info)
4017 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4019 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4020 free_data_ref (dr);
4021 if (dump_enabled_p ())
4023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4024 (gatherscatter == GATHER) ?
4025 "not vectorized: not suitable for gather "
4026 "load " :
4027 "not vectorized: not suitable for scatter "
4028 "store ");
4029 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4031 return false;
4034 free_data_ref (datarefs[i]);
4035 datarefs[i] = dr;
4036 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4039 else if (is_a <loop_vec_info> (vinfo)
4040 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4042 if (nested_in_vect_loop_p (loop, stmt))
4044 if (dump_enabled_p ())
4046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4047 "not vectorized: not suitable for strided "
4048 "load ");
4049 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4051 return false;
4053 STMT_VINFO_STRIDED_P (stmt_info) = true;
4057 /* If we stopped analysis at the first dataref we could not analyze
4058 when trying to vectorize a basic-block mark the rest of the datarefs
4059 as not vectorizable and truncate the vector of datarefs. That
4060 avoids spending useless time in analyzing their dependence. */
4061 if (i != datarefs.length ())
4063 gcc_assert (is_a <bb_vec_info> (vinfo));
4064 for (unsigned j = i; j < datarefs.length (); ++j)
4066 data_reference_p dr = datarefs[j];
4067 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4068 free_data_ref (dr);
4070 datarefs.truncate (i);
4073 return true;
4077 /* Function vect_get_new_vect_var.
4079 Returns a name for a new variable. The current naming scheme appends the
4080 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4081 the name of vectorizer generated variables, and appends that to NAME if
4082 provided. */
4084 tree
4085 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4087 const char *prefix;
4088 tree new_vect_var;
4090 switch (var_kind)
4092 case vect_simple_var:
4093 prefix = "vect";
4094 break;
4095 case vect_scalar_var:
4096 prefix = "stmp";
4097 break;
4098 case vect_mask_var:
4099 prefix = "mask";
4100 break;
4101 case vect_pointer_var:
4102 prefix = "vectp";
4103 break;
4104 default:
4105 gcc_unreachable ();
4108 if (name)
4110 char* tmp = concat (prefix, "_", name, NULL);
4111 new_vect_var = create_tmp_reg (type, tmp);
4112 free (tmp);
4114 else
4115 new_vect_var = create_tmp_reg (type, prefix);
4117 return new_vect_var;
4120 /* Like vect_get_new_vect_var but return an SSA name. */
4122 tree
4123 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4125 const char *prefix;
4126 tree new_vect_var;
4128 switch (var_kind)
4130 case vect_simple_var:
4131 prefix = "vect";
4132 break;
4133 case vect_scalar_var:
4134 prefix = "stmp";
4135 break;
4136 case vect_pointer_var:
4137 prefix = "vectp";
4138 break;
4139 default:
4140 gcc_unreachable ();
4143 if (name)
4145 char* tmp = concat (prefix, "_", name, NULL);
4146 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4147 free (tmp);
4149 else
4150 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4152 return new_vect_var;
4155 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4157 static void
4158 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4159 stmt_vec_info stmt_info)
4161 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4162 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4163 int misalign = DR_MISALIGNMENT (dr);
4164 if (misalign == -1)
4165 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4166 else
4167 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4170 /* Function vect_create_addr_base_for_vector_ref.
4172 Create an expression that computes the address of the first memory location
4173 that will be accessed for a data reference.
4175 Input:
4176 STMT: The statement containing the data reference.
4177 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4178 OFFSET: Optional. If supplied, it is be added to the initial address.
4179 LOOP: Specify relative to which loop-nest should the address be computed.
4180 For example, when the dataref is in an inner-loop nested in an
4181 outer-loop that is now being vectorized, LOOP can be either the
4182 outer-loop, or the inner-loop. The first memory location accessed
4183 by the following dataref ('in' points to short):
4185 for (i=0; i<N; i++)
4186 for (j=0; j<M; j++)
4187 s += in[i+j]
4189 is as follows:
4190 if LOOP=i_loop: &in (relative to i_loop)
4191 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4192 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4193 initial address. Unlike OFFSET, which is number of elements to
4194 be added, BYTE_OFFSET is measured in bytes.
4196 Output:
4197 1. Return an SSA_NAME whose value is the address of the memory location of
4198 the first vector of the data reference.
4199 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4200 these statement(s) which define the returned SSA_NAME.
4202 FORNOW: We are only handling array accesses with step 1. */
4204 tree
4205 vect_create_addr_base_for_vector_ref (gimple *stmt,
4206 gimple_seq *new_stmt_list,
4207 tree offset,
4208 struct loop *loop,
4209 tree byte_offset)
4211 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4212 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4213 tree data_ref_base;
4214 const char *base_name;
4215 tree addr_base;
4216 tree dest;
4217 gimple_seq seq = NULL;
4218 tree base_offset;
4219 tree init;
4220 tree vect_ptr_type;
4221 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4222 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4224 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4226 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4228 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4230 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4231 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4232 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4234 else
4236 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4237 base_offset = unshare_expr (DR_OFFSET (dr));
4238 init = unshare_expr (DR_INIT (dr));
4241 if (loop_vinfo)
4242 base_name = get_name (data_ref_base);
4243 else
4245 base_offset = ssize_int (0);
4246 init = ssize_int (0);
4247 base_name = get_name (DR_REF (dr));
4250 /* Create base_offset */
4251 base_offset = size_binop (PLUS_EXPR,
4252 fold_convert (sizetype, base_offset),
4253 fold_convert (sizetype, init));
4255 if (offset)
4257 offset = fold_build2 (MULT_EXPR, sizetype,
4258 fold_convert (sizetype, offset), step);
4259 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4260 base_offset, offset);
4262 if (byte_offset)
4264 byte_offset = fold_convert (sizetype, byte_offset);
4265 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4266 base_offset, byte_offset);
4269 /* base + base_offset */
4270 if (loop_vinfo)
4271 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4272 else
4274 addr_base = build1 (ADDR_EXPR,
4275 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4276 unshare_expr (DR_REF (dr)));
4279 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4280 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4281 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4282 gimple_seq_add_seq (new_stmt_list, seq);
4284 if (DR_PTR_INFO (dr)
4285 && TREE_CODE (addr_base) == SSA_NAME
4286 && !SSA_NAME_PTR_INFO (addr_base))
4288 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4289 if (offset || byte_offset)
4290 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4293 if (dump_enabled_p ())
4295 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4296 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4297 dump_printf (MSG_NOTE, "\n");
4300 return addr_base;
4304 /* Function vect_create_data_ref_ptr.
4306 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4307 location accessed in the loop by STMT, along with the def-use update
4308 chain to appropriately advance the pointer through the loop iterations.
4309 Also set aliasing information for the pointer. This pointer is used by
4310 the callers to this function to create a memory reference expression for
4311 vector load/store access.
4313 Input:
4314 1. STMT: a stmt that references memory. Expected to be of the form
4315 GIMPLE_ASSIGN <name, data-ref> or
4316 GIMPLE_ASSIGN <data-ref, name>.
4317 2. AGGR_TYPE: the type of the reference, which should be either a vector
4318 or an array.
4319 3. AT_LOOP: the loop where the vector memref is to be created.
4320 4. OFFSET (optional): an offset to be added to the initial address accessed
4321 by the data-ref in STMT.
4322 5. BSI: location where the new stmts are to be placed if there is no loop
4323 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4324 pointing to the initial address.
4325 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4326 to the initial address accessed by the data-ref in STMT. This is
4327 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4328 in bytes.
4330 Output:
4331 1. Declare a new ptr to vector_type, and have it point to the base of the
4332 data reference (initial addressed accessed by the data reference).
4333 For example, for vector of type V8HI, the following code is generated:
4335 v8hi *ap;
4336 ap = (v8hi *)initial_address;
4338 if OFFSET is not supplied:
4339 initial_address = &a[init];
4340 if OFFSET is supplied:
4341 initial_address = &a[init + OFFSET];
4342 if BYTE_OFFSET is supplied:
4343 initial_address = &a[init] + BYTE_OFFSET;
4345 Return the initial_address in INITIAL_ADDRESS.
4347 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4348 update the pointer in each iteration of the loop.
4350 Return the increment stmt that updates the pointer in PTR_INCR.
4352 3. Set INV_P to true if the access pattern of the data reference in the
4353 vectorized loop is invariant. Set it to false otherwise.
4355 4. Return the pointer. */
4357 tree
4358 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4359 tree offset, tree *initial_address,
4360 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4361 bool only_init, bool *inv_p, tree byte_offset)
4363 const char *base_name;
4364 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4365 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4366 struct loop *loop = NULL;
4367 bool nested_in_vect_loop = false;
4368 struct loop *containing_loop = NULL;
4369 tree aggr_ptr_type;
4370 tree aggr_ptr;
4371 tree new_temp;
4372 gimple_seq new_stmt_list = NULL;
4373 edge pe = NULL;
4374 basic_block new_bb;
4375 tree aggr_ptr_init;
4376 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4377 tree aptr;
4378 gimple_stmt_iterator incr_gsi;
4379 bool insert_after;
4380 tree indx_before_incr, indx_after_incr;
4381 gimple *incr;
4382 tree step;
4383 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4385 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4386 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4388 if (loop_vinfo)
4390 loop = LOOP_VINFO_LOOP (loop_vinfo);
4391 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4392 containing_loop = (gimple_bb (stmt))->loop_father;
4393 pe = loop_preheader_edge (loop);
4395 else
4397 gcc_assert (bb_vinfo);
4398 only_init = true;
4399 *ptr_incr = NULL;
4402 /* Check the step (evolution) of the load in LOOP, and record
4403 whether it's invariant. */
4404 if (nested_in_vect_loop)
4405 step = STMT_VINFO_DR_STEP (stmt_info);
4406 else
4407 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4409 if (integer_zerop (step))
4410 *inv_p = true;
4411 else
4412 *inv_p = false;
4414 /* Create an expression for the first address accessed by this load
4415 in LOOP. */
4416 base_name = get_name (DR_BASE_ADDRESS (dr));
4418 if (dump_enabled_p ())
4420 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4421 dump_printf_loc (MSG_NOTE, vect_location,
4422 "create %s-pointer variable to type: ",
4423 get_tree_code_name (TREE_CODE (aggr_type)));
4424 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4425 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4426 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4427 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4428 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4429 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4430 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4431 else
4432 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4433 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4434 dump_printf (MSG_NOTE, "\n");
4437 /* (1) Create the new aggregate-pointer variable.
4438 Vector and array types inherit the alias set of their component
4439 type by default so we need to use a ref-all pointer if the data
4440 reference does not conflict with the created aggregated data
4441 reference because it is not addressable. */
4442 bool need_ref_all = false;
4443 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4444 get_alias_set (DR_REF (dr))))
4445 need_ref_all = true;
4446 /* Likewise for any of the data references in the stmt group. */
4447 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4449 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4452 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4453 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4454 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4455 get_alias_set (DR_REF (sdr))))
4457 need_ref_all = true;
4458 break;
4460 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4462 while (orig_stmt);
4464 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4465 need_ref_all);
4466 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4469 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4470 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4471 def-use update cycles for the pointer: one relative to the outer-loop
4472 (LOOP), which is what steps (3) and (4) below do. The other is relative
4473 to the inner-loop (which is the inner-most loop containing the dataref),
4474 and this is done be step (5) below.
4476 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4477 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4478 redundant. Steps (3),(4) create the following:
4480 vp0 = &base_addr;
4481 LOOP: vp1 = phi(vp0,vp2)
4484 vp2 = vp1 + step
4485 goto LOOP
4487 If there is an inner-loop nested in loop, then step (5) will also be
4488 applied, and an additional update in the inner-loop will be created:
4490 vp0 = &base_addr;
4491 LOOP: vp1 = phi(vp0,vp2)
4493 inner: vp3 = phi(vp1,vp4)
4494 vp4 = vp3 + inner_step
4495 if () goto inner
4497 vp2 = vp1 + step
4498 if () goto LOOP */
4500 /* (2) Calculate the initial address of the aggregate-pointer, and set
4501 the aggregate-pointer to point to it before the loop. */
4503 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4505 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4506 offset, loop, byte_offset);
4507 if (new_stmt_list)
4509 if (pe)
4511 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4512 gcc_assert (!new_bb);
4514 else
4515 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4518 *initial_address = new_temp;
4519 aggr_ptr_init = new_temp;
4521 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4522 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4523 inner-loop nested in LOOP (during outer-loop vectorization). */
4525 /* No update in loop is required. */
4526 if (only_init && (!loop_vinfo || at_loop == loop))
4527 aptr = aggr_ptr_init;
4528 else
4530 /* The step of the aggregate pointer is the type size. */
4531 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4532 /* One exception to the above is when the scalar step of the load in
4533 LOOP is zero. In this case the step here is also zero. */
4534 if (*inv_p)
4535 iv_step = size_zero_node;
4536 else if (tree_int_cst_sgn (step) == -1)
4537 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4539 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4541 create_iv (aggr_ptr_init,
4542 fold_convert (aggr_ptr_type, iv_step),
4543 aggr_ptr, loop, &incr_gsi, insert_after,
4544 &indx_before_incr, &indx_after_incr);
4545 incr = gsi_stmt (incr_gsi);
4546 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4548 /* Copy the points-to information if it exists. */
4549 if (DR_PTR_INFO (dr))
4551 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4552 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4554 if (ptr_incr)
4555 *ptr_incr = incr;
4557 aptr = indx_before_incr;
4560 if (!nested_in_vect_loop || only_init)
4561 return aptr;
4564 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4565 nested in LOOP, if exists. */
4567 gcc_assert (nested_in_vect_loop);
4568 if (!only_init)
4570 standard_iv_increment_position (containing_loop, &incr_gsi,
4571 &insert_after);
4572 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4573 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4574 &indx_after_incr);
4575 incr = gsi_stmt (incr_gsi);
4576 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4578 /* Copy the points-to information if it exists. */
4579 if (DR_PTR_INFO (dr))
4581 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4582 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4584 if (ptr_incr)
4585 *ptr_incr = incr;
4587 return indx_before_incr;
4589 else
4590 gcc_unreachable ();
4594 /* Function bump_vector_ptr
4596 Increment a pointer (to a vector type) by vector-size. If requested,
4597 i.e. if PTR-INCR is given, then also connect the new increment stmt
4598 to the existing def-use update-chain of the pointer, by modifying
4599 the PTR_INCR as illustrated below:
4601 The pointer def-use update-chain before this function:
4602 DATAREF_PTR = phi (p_0, p_2)
4603 ....
4604 PTR_INCR: p_2 = DATAREF_PTR + step
4606 The pointer def-use update-chain after this function:
4607 DATAREF_PTR = phi (p_0, p_2)
4608 ....
4609 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4610 ....
4611 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4613 Input:
4614 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4615 in the loop.
4616 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4617 the loop. The increment amount across iterations is expected
4618 to be vector_size.
4619 BSI - location where the new update stmt is to be placed.
4620 STMT - the original scalar memory-access stmt that is being vectorized.
4621 BUMP - optional. The offset by which to bump the pointer. If not given,
4622 the offset is assumed to be vector_size.
4624 Output: Return NEW_DATAREF_PTR as illustrated above.
4628 tree
4629 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4630 gimple *stmt, tree bump)
4632 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4633 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4634 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4635 tree update = TYPE_SIZE_UNIT (vectype);
4636 gassign *incr_stmt;
4637 ssa_op_iter iter;
4638 use_operand_p use_p;
4639 tree new_dataref_ptr;
4641 if (bump)
4642 update = bump;
4644 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4645 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4646 else
4647 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4648 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4649 dataref_ptr, update);
4650 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4652 /* Copy the points-to information if it exists. */
4653 if (DR_PTR_INFO (dr))
4655 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4656 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4659 if (!ptr_incr)
4660 return new_dataref_ptr;
4662 /* Update the vector-pointer's cross-iteration increment. */
4663 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4665 tree use = USE_FROM_PTR (use_p);
4667 if (use == dataref_ptr)
4668 SET_USE (use_p, new_dataref_ptr);
4669 else
4670 gcc_assert (tree_int_cst_compare (use, update) == 0);
4673 return new_dataref_ptr;
4677 /* Function vect_create_destination_var.
4679 Create a new temporary of type VECTYPE. */
4681 tree
4682 vect_create_destination_var (tree scalar_dest, tree vectype)
4684 tree vec_dest;
4685 const char *name;
4686 char *new_name;
4687 tree type;
4688 enum vect_var_kind kind;
4690 kind = vectype
4691 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4692 ? vect_mask_var
4693 : vect_simple_var
4694 : vect_scalar_var;
4695 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4697 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4699 name = get_name (scalar_dest);
4700 if (name)
4701 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4702 else
4703 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4704 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4705 free (new_name);
4707 return vec_dest;
4710 /* Function vect_grouped_store_supported.
4712 Returns TRUE if interleave high and interleave low permutations
4713 are supported, and FALSE otherwise. */
4715 bool
4716 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4718 machine_mode mode = TYPE_MODE (vectype);
4720 /* vect_permute_store_chain requires the group size to be equal to 3 or
4721 be a power of two. */
4722 if (count != 3 && exact_log2 (count) == -1)
4724 if (dump_enabled_p ())
4725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4726 "the size of the group of accesses"
4727 " is not a power of 2 or not eqaul to 3\n");
4728 return false;
4731 /* Check that the permutation is supported. */
4732 if (VECTOR_MODE_P (mode))
4734 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4735 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4737 if (count == 3)
4739 unsigned int j0 = 0, j1 = 0, j2 = 0;
4740 unsigned int i, j;
4742 for (j = 0; j < 3; j++)
4744 int nelt0 = ((3 - j) * nelt) % 3;
4745 int nelt1 = ((3 - j) * nelt + 1) % 3;
4746 int nelt2 = ((3 - j) * nelt + 2) % 3;
4747 for (i = 0; i < nelt; i++)
4749 if (3 * i + nelt0 < nelt)
4750 sel[3 * i + nelt0] = j0++;
4751 if (3 * i + nelt1 < nelt)
4752 sel[3 * i + nelt1] = nelt + j1++;
4753 if (3 * i + nelt2 < nelt)
4754 sel[3 * i + nelt2] = 0;
4756 if (!can_vec_perm_p (mode, false, sel))
4758 if (dump_enabled_p ())
4759 dump_printf (MSG_MISSED_OPTIMIZATION,
4760 "permutaion op not supported by target.\n");
4761 return false;
4764 for (i = 0; i < nelt; i++)
4766 if (3 * i + nelt0 < nelt)
4767 sel[3 * i + nelt0] = 3 * i + nelt0;
4768 if (3 * i + nelt1 < nelt)
4769 sel[3 * i + nelt1] = 3 * i + nelt1;
4770 if (3 * i + nelt2 < nelt)
4771 sel[3 * i + nelt2] = nelt + j2++;
4773 if (!can_vec_perm_p (mode, false, sel))
4775 if (dump_enabled_p ())
4776 dump_printf (MSG_MISSED_OPTIMIZATION,
4777 "permutaion op not supported by target.\n");
4778 return false;
4781 return true;
4783 else
4785 /* If length is not equal to 3 then only power of 2 is supported. */
4786 gcc_assert (pow2p_hwi (count));
4788 for (i = 0; i < nelt / 2; i++)
4790 sel[i * 2] = i;
4791 sel[i * 2 + 1] = i + nelt;
4793 if (can_vec_perm_p (mode, false, sel))
4795 for (i = 0; i < nelt; i++)
4796 sel[i] += nelt / 2;
4797 if (can_vec_perm_p (mode, false, sel))
4798 return true;
4803 if (dump_enabled_p ())
4804 dump_printf (MSG_MISSED_OPTIMIZATION,
4805 "permutaion op not supported by target.\n");
4806 return false;
4810 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4811 type VECTYPE. */
4813 bool
4814 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4816 return vect_lanes_optab_supported_p ("vec_store_lanes",
4817 vec_store_lanes_optab,
4818 vectype, count);
4822 /* Function vect_permute_store_chain.
4824 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4825 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4826 the data correctly for the stores. Return the final references for stores
4827 in RESULT_CHAIN.
4829 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4830 The input is 4 vectors each containing 8 elements. We assign a number to
4831 each element, the input sequence is:
4833 1st vec: 0 1 2 3 4 5 6 7
4834 2nd vec: 8 9 10 11 12 13 14 15
4835 3rd vec: 16 17 18 19 20 21 22 23
4836 4th vec: 24 25 26 27 28 29 30 31
4838 The output sequence should be:
4840 1st vec: 0 8 16 24 1 9 17 25
4841 2nd vec: 2 10 18 26 3 11 19 27
4842 3rd vec: 4 12 20 28 5 13 21 30
4843 4th vec: 6 14 22 30 7 15 23 31
4845 i.e., we interleave the contents of the four vectors in their order.
4847 We use interleave_high/low instructions to create such output. The input of
4848 each interleave_high/low operation is two vectors:
4849 1st vec 2nd vec
4850 0 1 2 3 4 5 6 7
4851 the even elements of the result vector are obtained left-to-right from the
4852 high/low elements of the first vector. The odd elements of the result are
4853 obtained left-to-right from the high/low elements of the second vector.
4854 The output of interleave_high will be: 0 4 1 5
4855 and of interleave_low: 2 6 3 7
4858 The permutation is done in log LENGTH stages. In each stage interleave_high
4859 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4860 where the first argument is taken from the first half of DR_CHAIN and the
4861 second argument from it's second half.
4862 In our example,
4864 I1: interleave_high (1st vec, 3rd vec)
4865 I2: interleave_low (1st vec, 3rd vec)
4866 I3: interleave_high (2nd vec, 4th vec)
4867 I4: interleave_low (2nd vec, 4th vec)
4869 The output for the first stage is:
4871 I1: 0 16 1 17 2 18 3 19
4872 I2: 4 20 5 21 6 22 7 23
4873 I3: 8 24 9 25 10 26 11 27
4874 I4: 12 28 13 29 14 30 15 31
4876 The output of the second stage, i.e. the final result is:
4878 I1: 0 8 16 24 1 9 17 25
4879 I2: 2 10 18 26 3 11 19 27
4880 I3: 4 12 20 28 5 13 21 30
4881 I4: 6 14 22 30 7 15 23 31. */
4883 void
4884 vect_permute_store_chain (vec<tree> dr_chain,
4885 unsigned int length,
4886 gimple *stmt,
4887 gimple_stmt_iterator *gsi,
4888 vec<tree> *result_chain)
4890 tree vect1, vect2, high, low;
4891 gimple *perm_stmt;
4892 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4893 tree perm_mask_low, perm_mask_high;
4894 tree data_ref;
4895 tree perm3_mask_low, perm3_mask_high;
4896 unsigned int i, n, log_length = exact_log2 (length);
4897 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4898 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4900 result_chain->quick_grow (length);
4901 memcpy (result_chain->address (), dr_chain.address (),
4902 length * sizeof (tree));
4904 if (length == 3)
4906 unsigned int j0 = 0, j1 = 0, j2 = 0;
4908 for (j = 0; j < 3; j++)
4910 int nelt0 = ((3 - j) * nelt) % 3;
4911 int nelt1 = ((3 - j) * nelt + 1) % 3;
4912 int nelt2 = ((3 - j) * nelt + 2) % 3;
4914 for (i = 0; i < nelt; i++)
4916 if (3 * i + nelt0 < nelt)
4917 sel[3 * i + nelt0] = j0++;
4918 if (3 * i + nelt1 < nelt)
4919 sel[3 * i + nelt1] = nelt + j1++;
4920 if (3 * i + nelt2 < nelt)
4921 sel[3 * i + nelt2] = 0;
4923 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4925 for (i = 0; i < nelt; i++)
4927 if (3 * i + nelt0 < nelt)
4928 sel[3 * i + nelt0] = 3 * i + nelt0;
4929 if (3 * i + nelt1 < nelt)
4930 sel[3 * i + nelt1] = 3 * i + nelt1;
4931 if (3 * i + nelt2 < nelt)
4932 sel[3 * i + nelt2] = nelt + j2++;
4934 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4936 vect1 = dr_chain[0];
4937 vect2 = dr_chain[1];
4939 /* Create interleaving stmt:
4940 low = VEC_PERM_EXPR <vect1, vect2,
4941 {j, nelt, *, j + 1, nelt + j + 1, *,
4942 j + 2, nelt + j + 2, *, ...}> */
4943 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4944 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4945 vect2, perm3_mask_low);
4946 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4948 vect1 = data_ref;
4949 vect2 = dr_chain[2];
4950 /* Create interleaving stmt:
4951 low = VEC_PERM_EXPR <vect1, vect2,
4952 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4953 6, 7, nelt + j + 2, ...}> */
4954 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4955 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4956 vect2, perm3_mask_high);
4957 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4958 (*result_chain)[j] = data_ref;
4961 else
4963 /* If length is not equal to 3 then only power of 2 is supported. */
4964 gcc_assert (pow2p_hwi (length));
4966 for (i = 0, n = nelt / 2; i < n; i++)
4968 sel[i * 2] = i;
4969 sel[i * 2 + 1] = i + nelt;
4971 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4973 for (i = 0; i < nelt; i++)
4974 sel[i] += nelt / 2;
4975 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4977 for (i = 0, n = log_length; i < n; i++)
4979 for (j = 0; j < length/2; j++)
4981 vect1 = dr_chain[j];
4982 vect2 = dr_chain[j+length/2];
4984 /* Create interleaving stmt:
4985 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4986 ...}> */
4987 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4988 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4989 vect2, perm_mask_high);
4990 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4991 (*result_chain)[2*j] = high;
4993 /* Create interleaving stmt:
4994 low = VEC_PERM_EXPR <vect1, vect2,
4995 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4996 ...}> */
4997 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4998 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4999 vect2, perm_mask_low);
5000 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5001 (*result_chain)[2*j+1] = low;
5003 memcpy (dr_chain.address (), result_chain->address (),
5004 length * sizeof (tree));
5009 /* Function vect_setup_realignment
5011 This function is called when vectorizing an unaligned load using
5012 the dr_explicit_realign[_optimized] scheme.
5013 This function generates the following code at the loop prolog:
5015 p = initial_addr;
5016 x msq_init = *(floor(p)); # prolog load
5017 realignment_token = call target_builtin;
5018 loop:
5019 x msq = phi (msq_init, ---)
5021 The stmts marked with x are generated only for the case of
5022 dr_explicit_realign_optimized.
5024 The code above sets up a new (vector) pointer, pointing to the first
5025 location accessed by STMT, and a "floor-aligned" load using that pointer.
5026 It also generates code to compute the "realignment-token" (if the relevant
5027 target hook was defined), and creates a phi-node at the loop-header bb
5028 whose arguments are the result of the prolog-load (created by this
5029 function) and the result of a load that takes place in the loop (to be
5030 created by the caller to this function).
5032 For the case of dr_explicit_realign_optimized:
5033 The caller to this function uses the phi-result (msq) to create the
5034 realignment code inside the loop, and sets up the missing phi argument,
5035 as follows:
5036 loop:
5037 msq = phi (msq_init, lsq)
5038 lsq = *(floor(p')); # load in loop
5039 result = realign_load (msq, lsq, realignment_token);
5041 For the case of dr_explicit_realign:
5042 loop:
5043 msq = *(floor(p)); # load in loop
5044 p' = p + (VS-1);
5045 lsq = *(floor(p')); # load in loop
5046 result = realign_load (msq, lsq, realignment_token);
5048 Input:
5049 STMT - (scalar) load stmt to be vectorized. This load accesses
5050 a memory location that may be unaligned.
5051 BSI - place where new code is to be inserted.
5052 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5053 is used.
5055 Output:
5056 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5057 target hook, if defined.
5058 Return value - the result of the loop-header phi node. */
5060 tree
5061 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5062 tree *realignment_token,
5063 enum dr_alignment_support alignment_support_scheme,
5064 tree init_addr,
5065 struct loop **at_loop)
5067 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5068 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5069 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5070 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5071 struct loop *loop = NULL;
5072 edge pe = NULL;
5073 tree scalar_dest = gimple_assign_lhs (stmt);
5074 tree vec_dest;
5075 gimple *inc;
5076 tree ptr;
5077 tree data_ref;
5078 basic_block new_bb;
5079 tree msq_init = NULL_TREE;
5080 tree new_temp;
5081 gphi *phi_stmt;
5082 tree msq = NULL_TREE;
5083 gimple_seq stmts = NULL;
5084 bool inv_p;
5085 bool compute_in_loop = false;
5086 bool nested_in_vect_loop = false;
5087 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5088 struct loop *loop_for_initial_load = NULL;
5090 if (loop_vinfo)
5092 loop = LOOP_VINFO_LOOP (loop_vinfo);
5093 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5096 gcc_assert (alignment_support_scheme == dr_explicit_realign
5097 || alignment_support_scheme == dr_explicit_realign_optimized);
5099 /* We need to generate three things:
5100 1. the misalignment computation
5101 2. the extra vector load (for the optimized realignment scheme).
5102 3. the phi node for the two vectors from which the realignment is
5103 done (for the optimized realignment scheme). */
5105 /* 1. Determine where to generate the misalignment computation.
5107 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5108 calculation will be generated by this function, outside the loop (in the
5109 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5110 caller, inside the loop.
5112 Background: If the misalignment remains fixed throughout the iterations of
5113 the loop, then both realignment schemes are applicable, and also the
5114 misalignment computation can be done outside LOOP. This is because we are
5115 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5116 are a multiple of VS (the Vector Size), and therefore the misalignment in
5117 different vectorized LOOP iterations is always the same.
5118 The problem arises only if the memory access is in an inner-loop nested
5119 inside LOOP, which is now being vectorized using outer-loop vectorization.
5120 This is the only case when the misalignment of the memory access may not
5121 remain fixed throughout the iterations of the inner-loop (as explained in
5122 detail in vect_supportable_dr_alignment). In this case, not only is the
5123 optimized realignment scheme not applicable, but also the misalignment
5124 computation (and generation of the realignment token that is passed to
5125 REALIGN_LOAD) have to be done inside the loop.
5127 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5128 or not, which in turn determines if the misalignment is computed inside
5129 the inner-loop, or outside LOOP. */
5131 if (init_addr != NULL_TREE || !loop_vinfo)
5133 compute_in_loop = true;
5134 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5138 /* 2. Determine where to generate the extra vector load.
5140 For the optimized realignment scheme, instead of generating two vector
5141 loads in each iteration, we generate a single extra vector load in the
5142 preheader of the loop, and in each iteration reuse the result of the
5143 vector load from the previous iteration. In case the memory access is in
5144 an inner-loop nested inside LOOP, which is now being vectorized using
5145 outer-loop vectorization, we need to determine whether this initial vector
5146 load should be generated at the preheader of the inner-loop, or can be
5147 generated at the preheader of LOOP. If the memory access has no evolution
5148 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5149 to be generated inside LOOP (in the preheader of the inner-loop). */
5151 if (nested_in_vect_loop)
5153 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5154 bool invariant_in_outerloop =
5155 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5156 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5158 else
5159 loop_for_initial_load = loop;
5160 if (at_loop)
5161 *at_loop = loop_for_initial_load;
5163 if (loop_for_initial_load)
5164 pe = loop_preheader_edge (loop_for_initial_load);
5166 /* 3. For the case of the optimized realignment, create the first vector
5167 load at the loop preheader. */
5169 if (alignment_support_scheme == dr_explicit_realign_optimized)
5171 /* Create msq_init = *(floor(p1)) in the loop preheader */
5172 gassign *new_stmt;
5174 gcc_assert (!compute_in_loop);
5175 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5176 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5177 NULL_TREE, &init_addr, NULL, &inc,
5178 true, &inv_p);
5179 if (TREE_CODE (ptr) == SSA_NAME)
5180 new_temp = copy_ssa_name (ptr);
5181 else
5182 new_temp = make_ssa_name (TREE_TYPE (ptr));
5183 new_stmt = gimple_build_assign
5184 (new_temp, BIT_AND_EXPR, ptr,
5185 build_int_cst (TREE_TYPE (ptr),
5186 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5187 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5188 gcc_assert (!new_bb);
5189 data_ref
5190 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5191 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5192 new_stmt = gimple_build_assign (vec_dest, data_ref);
5193 new_temp = make_ssa_name (vec_dest, new_stmt);
5194 gimple_assign_set_lhs (new_stmt, new_temp);
5195 if (pe)
5197 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5198 gcc_assert (!new_bb);
5200 else
5201 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5203 msq_init = gimple_assign_lhs (new_stmt);
5206 /* 4. Create realignment token using a target builtin, if available.
5207 It is done either inside the containing loop, or before LOOP (as
5208 determined above). */
5210 if (targetm.vectorize.builtin_mask_for_load)
5212 gcall *new_stmt;
5213 tree builtin_decl;
5215 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5216 if (!init_addr)
5218 /* Generate the INIT_ADDR computation outside LOOP. */
5219 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5220 NULL_TREE, loop);
5221 if (loop)
5223 pe = loop_preheader_edge (loop);
5224 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5225 gcc_assert (!new_bb);
5227 else
5228 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5231 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5232 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5233 vec_dest =
5234 vect_create_destination_var (scalar_dest,
5235 gimple_call_return_type (new_stmt));
5236 new_temp = make_ssa_name (vec_dest, new_stmt);
5237 gimple_call_set_lhs (new_stmt, new_temp);
5239 if (compute_in_loop)
5240 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5241 else
5243 /* Generate the misalignment computation outside LOOP. */
5244 pe = loop_preheader_edge (loop);
5245 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5246 gcc_assert (!new_bb);
5249 *realignment_token = gimple_call_lhs (new_stmt);
5251 /* The result of the CALL_EXPR to this builtin is determined from
5252 the value of the parameter and no global variables are touched
5253 which makes the builtin a "const" function. Requiring the
5254 builtin to have the "const" attribute makes it unnecessary
5255 to call mark_call_clobbered. */
5256 gcc_assert (TREE_READONLY (builtin_decl));
5259 if (alignment_support_scheme == dr_explicit_realign)
5260 return msq;
5262 gcc_assert (!compute_in_loop);
5263 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5266 /* 5. Create msq = phi <msq_init, lsq> in loop */
5268 pe = loop_preheader_edge (containing_loop);
5269 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5270 msq = make_ssa_name (vec_dest);
5271 phi_stmt = create_phi_node (msq, containing_loop->header);
5272 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5274 return msq;
5278 /* Function vect_grouped_load_supported.
5280 COUNT is the size of the load group (the number of statements plus the
5281 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5282 only one statement, with a gap of COUNT - 1.
5284 Returns true if a suitable permute exists. */
5286 bool
5287 vect_grouped_load_supported (tree vectype, bool single_element_p,
5288 unsigned HOST_WIDE_INT count)
5290 machine_mode mode = TYPE_MODE (vectype);
5292 /* If this is single-element interleaving with an element distance
5293 that leaves unused vector loads around punt - we at least create
5294 very sub-optimal code in that case (and blow up memory,
5295 see PR65518). */
5296 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5298 if (dump_enabled_p ())
5299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5300 "single-element interleaving not supported "
5301 "for not adjacent vector loads\n");
5302 return false;
5305 /* vect_permute_load_chain requires the group size to be equal to 3 or
5306 be a power of two. */
5307 if (count != 3 && exact_log2 (count) == -1)
5309 if (dump_enabled_p ())
5310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5311 "the size of the group of accesses"
5312 " is not a power of 2 or not equal to 3\n");
5313 return false;
5316 /* Check that the permutation is supported. */
5317 if (VECTOR_MODE_P (mode))
5319 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5320 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5322 if (count == 3)
5324 unsigned int k;
5325 for (k = 0; k < 3; k++)
5327 for (i = 0; i < nelt; i++)
5328 if (3 * i + k < 2 * nelt)
5329 sel[i] = 3 * i + k;
5330 else
5331 sel[i] = 0;
5332 if (!can_vec_perm_p (mode, false, sel))
5334 if (dump_enabled_p ())
5335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5336 "shuffle of 3 loads is not supported by"
5337 " target\n");
5338 return false;
5340 for (i = 0, j = 0; i < nelt; i++)
5341 if (3 * i + k < 2 * nelt)
5342 sel[i] = i;
5343 else
5344 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5345 if (!can_vec_perm_p (mode, false, sel))
5347 if (dump_enabled_p ())
5348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5349 "shuffle of 3 loads is not supported by"
5350 " target\n");
5351 return false;
5354 return true;
5356 else
5358 /* If length is not equal to 3 then only power of 2 is supported. */
5359 gcc_assert (pow2p_hwi (count));
5360 for (i = 0; i < nelt; i++)
5361 sel[i] = i * 2;
5362 if (can_vec_perm_p (mode, false, sel))
5364 for (i = 0; i < nelt; i++)
5365 sel[i] = i * 2 + 1;
5366 if (can_vec_perm_p (mode, false, sel))
5367 return true;
5372 if (dump_enabled_p ())
5373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5374 "extract even/odd not supported by target\n");
5375 return false;
5378 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5379 type VECTYPE. */
5381 bool
5382 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5384 return vect_lanes_optab_supported_p ("vec_load_lanes",
5385 vec_load_lanes_optab,
5386 vectype, count);
5389 /* Function vect_permute_load_chain.
5391 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5392 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5393 the input data correctly. Return the final references for loads in
5394 RESULT_CHAIN.
5396 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5397 The input is 4 vectors each containing 8 elements. We assign a number to each
5398 element, the input sequence is:
5400 1st vec: 0 1 2 3 4 5 6 7
5401 2nd vec: 8 9 10 11 12 13 14 15
5402 3rd vec: 16 17 18 19 20 21 22 23
5403 4th vec: 24 25 26 27 28 29 30 31
5405 The output sequence should be:
5407 1st vec: 0 4 8 12 16 20 24 28
5408 2nd vec: 1 5 9 13 17 21 25 29
5409 3rd vec: 2 6 10 14 18 22 26 30
5410 4th vec: 3 7 11 15 19 23 27 31
5412 i.e., the first output vector should contain the first elements of each
5413 interleaving group, etc.
5415 We use extract_even/odd instructions to create such output. The input of
5416 each extract_even/odd operation is two vectors
5417 1st vec 2nd vec
5418 0 1 2 3 4 5 6 7
5420 and the output is the vector of extracted even/odd elements. The output of
5421 extract_even will be: 0 2 4 6
5422 and of extract_odd: 1 3 5 7
5425 The permutation is done in log LENGTH stages. In each stage extract_even
5426 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5427 their order. In our example,
5429 E1: extract_even (1st vec, 2nd vec)
5430 E2: extract_odd (1st vec, 2nd vec)
5431 E3: extract_even (3rd vec, 4th vec)
5432 E4: extract_odd (3rd vec, 4th vec)
5434 The output for the first stage will be:
5436 E1: 0 2 4 6 8 10 12 14
5437 E2: 1 3 5 7 9 11 13 15
5438 E3: 16 18 20 22 24 26 28 30
5439 E4: 17 19 21 23 25 27 29 31
5441 In order to proceed and create the correct sequence for the next stage (or
5442 for the correct output, if the second stage is the last one, as in our
5443 example), we first put the output of extract_even operation and then the
5444 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5445 The input for the second stage is:
5447 1st vec (E1): 0 2 4 6 8 10 12 14
5448 2nd vec (E3): 16 18 20 22 24 26 28 30
5449 3rd vec (E2): 1 3 5 7 9 11 13 15
5450 4th vec (E4): 17 19 21 23 25 27 29 31
5452 The output of the second stage:
5454 E1: 0 4 8 12 16 20 24 28
5455 E2: 2 6 10 14 18 22 26 30
5456 E3: 1 5 9 13 17 21 25 29
5457 E4: 3 7 11 15 19 23 27 31
5459 And RESULT_CHAIN after reordering:
5461 1st vec (E1): 0 4 8 12 16 20 24 28
5462 2nd vec (E3): 1 5 9 13 17 21 25 29
5463 3rd vec (E2): 2 6 10 14 18 22 26 30
5464 4th vec (E4): 3 7 11 15 19 23 27 31. */
5466 static void
5467 vect_permute_load_chain (vec<tree> dr_chain,
5468 unsigned int length,
5469 gimple *stmt,
5470 gimple_stmt_iterator *gsi,
5471 vec<tree> *result_chain)
5473 tree data_ref, first_vect, second_vect;
5474 tree perm_mask_even, perm_mask_odd;
5475 tree perm3_mask_low, perm3_mask_high;
5476 gimple *perm_stmt;
5477 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5478 unsigned int i, j, log_length = exact_log2 (length);
5479 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5480 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5482 result_chain->quick_grow (length);
5483 memcpy (result_chain->address (), dr_chain.address (),
5484 length * sizeof (tree));
5486 if (length == 3)
5488 unsigned int k;
5490 for (k = 0; k < 3; k++)
5492 for (i = 0; i < nelt; i++)
5493 if (3 * i + k < 2 * nelt)
5494 sel[i] = 3 * i + k;
5495 else
5496 sel[i] = 0;
5497 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5499 for (i = 0, j = 0; i < nelt; i++)
5500 if (3 * i + k < 2 * nelt)
5501 sel[i] = i;
5502 else
5503 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5505 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5507 first_vect = dr_chain[0];
5508 second_vect = dr_chain[1];
5510 /* Create interleaving stmt (low part of):
5511 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5512 ...}> */
5513 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5514 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5515 second_vect, perm3_mask_low);
5516 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5518 /* Create interleaving stmt (high part of):
5519 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5520 ...}> */
5521 first_vect = data_ref;
5522 second_vect = dr_chain[2];
5523 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5524 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5525 second_vect, perm3_mask_high);
5526 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5527 (*result_chain)[k] = data_ref;
5530 else
5532 /* If length is not equal to 3 then only power of 2 is supported. */
5533 gcc_assert (pow2p_hwi (length));
5535 for (i = 0; i < nelt; ++i)
5536 sel[i] = i * 2;
5537 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5539 for (i = 0; i < nelt; ++i)
5540 sel[i] = i * 2 + 1;
5541 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5543 for (i = 0; i < log_length; i++)
5545 for (j = 0; j < length; j += 2)
5547 first_vect = dr_chain[j];
5548 second_vect = dr_chain[j+1];
5550 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5551 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5552 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5553 first_vect, second_vect,
5554 perm_mask_even);
5555 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5556 (*result_chain)[j/2] = data_ref;
5558 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5559 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5560 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5561 first_vect, second_vect,
5562 perm_mask_odd);
5563 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5564 (*result_chain)[j/2+length/2] = data_ref;
5566 memcpy (dr_chain.address (), result_chain->address (),
5567 length * sizeof (tree));
5572 /* Function vect_shift_permute_load_chain.
5574 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5575 sequence of stmts to reorder the input data accordingly.
5576 Return the final references for loads in RESULT_CHAIN.
5577 Return true if successed, false otherwise.
5579 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5580 The input is 3 vectors each containing 8 elements. We assign a
5581 number to each element, the input sequence is:
5583 1st vec: 0 1 2 3 4 5 6 7
5584 2nd vec: 8 9 10 11 12 13 14 15
5585 3rd vec: 16 17 18 19 20 21 22 23
5587 The output sequence should be:
5589 1st vec: 0 3 6 9 12 15 18 21
5590 2nd vec: 1 4 7 10 13 16 19 22
5591 3rd vec: 2 5 8 11 14 17 20 23
5593 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5595 First we shuffle all 3 vectors to get correct elements order:
5597 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5598 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5599 3rd vec: (16 19 22) (17 20 23) (18 21)
5601 Next we unite and shift vector 3 times:
5603 1st step:
5604 shift right by 6 the concatenation of:
5605 "1st vec" and "2nd vec"
5606 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5607 "2nd vec" and "3rd vec"
5608 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5609 "3rd vec" and "1st vec"
5610 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5611 | New vectors |
5613 So that now new vectors are:
5615 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5616 2nd vec: (10 13) (16 19 22) (17 20 23)
5617 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5619 2nd step:
5620 shift right by 5 the concatenation of:
5621 "1st vec" and "3rd vec"
5622 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5623 "2nd vec" and "1st vec"
5624 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5625 "3rd vec" and "2nd vec"
5626 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5627 | New vectors |
5629 So that now new vectors are:
5631 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5632 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5633 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5635 3rd step:
5636 shift right by 5 the concatenation of:
5637 "1st vec" and "1st vec"
5638 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5639 shift right by 3 the concatenation of:
5640 "2nd vec" and "2nd vec"
5641 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5642 | New vectors |
5644 So that now all vectors are READY:
5645 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5646 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5647 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5649 This algorithm is faster than one in vect_permute_load_chain if:
5650 1. "shift of a concatination" is faster than general permutation.
5651 This is usually so.
5652 2. The TARGET machine can't execute vector instructions in parallel.
5653 This is because each step of the algorithm depends on previous.
5654 The algorithm in vect_permute_load_chain is much more parallel.
5656 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5659 static bool
5660 vect_shift_permute_load_chain (vec<tree> dr_chain,
5661 unsigned int length,
5662 gimple *stmt,
5663 gimple_stmt_iterator *gsi,
5664 vec<tree> *result_chain)
5666 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5667 tree perm2_mask1, perm2_mask2, perm3_mask;
5668 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5669 gimple *perm_stmt;
5671 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5672 unsigned int i;
5673 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5674 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5675 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5676 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5678 result_chain->quick_grow (length);
5679 memcpy (result_chain->address (), dr_chain.address (),
5680 length * sizeof (tree));
5682 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5684 unsigned int j, log_length = exact_log2 (length);
5685 for (i = 0; i < nelt / 2; ++i)
5686 sel[i] = i * 2;
5687 for (i = 0; i < nelt / 2; ++i)
5688 sel[nelt / 2 + i] = i * 2 + 1;
5689 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5691 if (dump_enabled_p ())
5692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5693 "shuffle of 2 fields structure is not \
5694 supported by target\n");
5695 return false;
5697 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5699 for (i = 0; i < nelt / 2; ++i)
5700 sel[i] = i * 2 + 1;
5701 for (i = 0; i < nelt / 2; ++i)
5702 sel[nelt / 2 + i] = i * 2;
5703 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5705 if (dump_enabled_p ())
5706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5707 "shuffle of 2 fields structure is not \
5708 supported by target\n");
5709 return false;
5711 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5713 /* Generating permutation constant to shift all elements.
5714 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5715 for (i = 0; i < nelt; i++)
5716 sel[i] = nelt / 2 + i;
5717 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5719 if (dump_enabled_p ())
5720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5721 "shift permutation is not supported by target\n");
5722 return false;
5724 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5726 /* Generating permutation constant to select vector from 2.
5727 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5728 for (i = 0; i < nelt / 2; i++)
5729 sel[i] = i;
5730 for (i = nelt / 2; i < nelt; i++)
5731 sel[i] = nelt + i;
5732 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5734 if (dump_enabled_p ())
5735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5736 "select is not supported by target\n");
5737 return false;
5739 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5741 for (i = 0; i < log_length; i++)
5743 for (j = 0; j < length; j += 2)
5745 first_vect = dr_chain[j];
5746 second_vect = dr_chain[j + 1];
5748 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5749 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5750 first_vect, first_vect,
5751 perm2_mask1);
5752 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5753 vect[0] = data_ref;
5755 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5756 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5757 second_vect, second_vect,
5758 perm2_mask2);
5759 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5760 vect[1] = data_ref;
5762 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5763 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5764 vect[0], vect[1], shift1_mask);
5765 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5766 (*result_chain)[j/2 + length/2] = data_ref;
5768 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5769 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5770 vect[0], vect[1], select_mask);
5771 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5772 (*result_chain)[j/2] = data_ref;
5774 memcpy (dr_chain.address (), result_chain->address (),
5775 length * sizeof (tree));
5777 return true;
5779 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5781 unsigned int k = 0, l = 0;
5783 /* Generating permutation constant to get all elements in rigth order.
5784 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5785 for (i = 0; i < nelt; i++)
5787 if (3 * k + (l % 3) >= nelt)
5789 k = 0;
5790 l += (3 - (nelt % 3));
5792 sel[i] = 3 * k + (l % 3);
5793 k++;
5795 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5797 if (dump_enabled_p ())
5798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5799 "shuffle of 3 fields structure is not \
5800 supported by target\n");
5801 return false;
5803 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5805 /* Generating permutation constant to shift all elements.
5806 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5807 for (i = 0; i < nelt; i++)
5808 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5809 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5811 if (dump_enabled_p ())
5812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5813 "shift permutation is not supported by target\n");
5814 return false;
5816 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5818 /* Generating permutation constant to shift all elements.
5819 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5820 for (i = 0; i < nelt; i++)
5821 sel[i] = 2 * (nelt / 3) + 1 + i;
5822 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5824 if (dump_enabled_p ())
5825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5826 "shift permutation is not supported by target\n");
5827 return false;
5829 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5831 /* Generating permutation constant to shift all elements.
5832 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5833 for (i = 0; i < nelt; i++)
5834 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5835 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5837 if (dump_enabled_p ())
5838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5839 "shift permutation is not supported by target\n");
5840 return false;
5842 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5844 /* Generating permutation constant to shift all elements.
5845 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5846 for (i = 0; i < nelt; i++)
5847 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5848 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5850 if (dump_enabled_p ())
5851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5852 "shift permutation is not supported by target\n");
5853 return false;
5855 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5857 for (k = 0; k < 3; k++)
5859 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5860 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5861 dr_chain[k], dr_chain[k],
5862 perm3_mask);
5863 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5864 vect[k] = data_ref;
5867 for (k = 0; k < 3; k++)
5869 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5870 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5871 vect[k % 3], vect[(k + 1) % 3],
5872 shift1_mask);
5873 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5874 vect_shift[k] = data_ref;
5877 for (k = 0; k < 3; k++)
5879 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5880 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5881 vect_shift[(4 - k) % 3],
5882 vect_shift[(3 - k) % 3],
5883 shift2_mask);
5884 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5885 vect[k] = data_ref;
5888 (*result_chain)[3 - (nelt % 3)] = vect[2];
5890 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5891 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5892 vect[0], shift3_mask);
5893 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5894 (*result_chain)[nelt % 3] = data_ref;
5896 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5897 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5898 vect[1], shift4_mask);
5899 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5900 (*result_chain)[0] = data_ref;
5901 return true;
5903 return false;
5906 /* Function vect_transform_grouped_load.
5908 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5909 to perform their permutation and ascribe the result vectorized statements to
5910 the scalar statements.
5913 void
5914 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5915 gimple_stmt_iterator *gsi)
5917 machine_mode mode;
5918 vec<tree> result_chain = vNULL;
5920 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5921 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5922 vectors, that are ready for vector computation. */
5923 result_chain.create (size);
5925 /* If reassociation width for vector type is 2 or greater target machine can
5926 execute 2 or more vector instructions in parallel. Otherwise try to
5927 get chain for loads group using vect_shift_permute_load_chain. */
5928 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5929 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5930 || pow2p_hwi (size)
5931 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5932 gsi, &result_chain))
5933 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5934 vect_record_grouped_load_vectors (stmt, result_chain);
5935 result_chain.release ();
5938 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5939 generated as part of the vectorization of STMT. Assign the statement
5940 for each vector to the associated scalar statement. */
5942 void
5943 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5945 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5946 gimple *next_stmt, *new_stmt;
5947 unsigned int i, gap_count;
5948 tree tmp_data_ref;
5950 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5951 Since we scan the chain starting from it's first node, their order
5952 corresponds the order of data-refs in RESULT_CHAIN. */
5953 next_stmt = first_stmt;
5954 gap_count = 1;
5955 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5957 if (!next_stmt)
5958 break;
5960 /* Skip the gaps. Loads created for the gaps will be removed by dead
5961 code elimination pass later. No need to check for the first stmt in
5962 the group, since it always exists.
5963 GROUP_GAP is the number of steps in elements from the previous
5964 access (if there is no gap GROUP_GAP is 1). We skip loads that
5965 correspond to the gaps. */
5966 if (next_stmt != first_stmt
5967 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5969 gap_count++;
5970 continue;
5973 while (next_stmt)
5975 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5976 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5977 copies, and we put the new vector statement in the first available
5978 RELATED_STMT. */
5979 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5980 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5981 else
5983 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5985 gimple *prev_stmt =
5986 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5987 gimple *rel_stmt =
5988 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5989 while (rel_stmt)
5991 prev_stmt = rel_stmt;
5992 rel_stmt =
5993 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5996 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5997 new_stmt;
6001 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6002 gap_count = 1;
6003 /* If NEXT_STMT accesses the same DR as the previous statement,
6004 put the same TMP_DATA_REF as its vectorized statement; otherwise
6005 get the next data-ref from RESULT_CHAIN. */
6006 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6007 break;
6012 /* Function vect_force_dr_alignment_p.
6014 Returns whether the alignment of a DECL can be forced to be aligned
6015 on ALIGNMENT bit boundary. */
6017 bool
6018 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6020 if (!VAR_P (decl))
6021 return false;
6023 if (decl_in_symtab_p (decl)
6024 && !symtab_node::get (decl)->can_increase_alignment_p ())
6025 return false;
6027 if (TREE_STATIC (decl))
6028 return (alignment <= MAX_OFILE_ALIGNMENT);
6029 else
6030 return (alignment <= MAX_STACK_ALIGNMENT);
6034 /* Return whether the data reference DR is supported with respect to its
6035 alignment.
6036 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6037 it is aligned, i.e., check if it is possible to vectorize it with different
6038 alignment. */
6040 enum dr_alignment_support
6041 vect_supportable_dr_alignment (struct data_reference *dr,
6042 bool check_aligned_accesses)
6044 gimple *stmt = DR_STMT (dr);
6045 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6046 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6047 machine_mode mode = TYPE_MODE (vectype);
6048 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6049 struct loop *vect_loop = NULL;
6050 bool nested_in_vect_loop = false;
6052 if (aligned_access_p (dr) && !check_aligned_accesses)
6053 return dr_aligned;
6055 /* For now assume all conditional loads/stores support unaligned
6056 access without any special code. */
6057 if (is_gimple_call (stmt)
6058 && gimple_call_internal_p (stmt)
6059 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6060 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6061 return dr_unaligned_supported;
6063 if (loop_vinfo)
6065 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6066 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6069 /* Possibly unaligned access. */
6071 /* We can choose between using the implicit realignment scheme (generating
6072 a misaligned_move stmt) and the explicit realignment scheme (generating
6073 aligned loads with a REALIGN_LOAD). There are two variants to the
6074 explicit realignment scheme: optimized, and unoptimized.
6075 We can optimize the realignment only if the step between consecutive
6076 vector loads is equal to the vector size. Since the vector memory
6077 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6078 is guaranteed that the misalignment amount remains the same throughout the
6079 execution of the vectorized loop. Therefore, we can create the
6080 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6081 at the loop preheader.
6083 However, in the case of outer-loop vectorization, when vectorizing a
6084 memory access in the inner-loop nested within the LOOP that is now being
6085 vectorized, while it is guaranteed that the misalignment of the
6086 vectorized memory access will remain the same in different outer-loop
6087 iterations, it is *not* guaranteed that is will remain the same throughout
6088 the execution of the inner-loop. This is because the inner-loop advances
6089 with the original scalar step (and not in steps of VS). If the inner-loop
6090 step happens to be a multiple of VS, then the misalignment remains fixed
6091 and we can use the optimized realignment scheme. For example:
6093 for (i=0; i<N; i++)
6094 for (j=0; j<M; j++)
6095 s += a[i+j];
6097 When vectorizing the i-loop in the above example, the step between
6098 consecutive vector loads is 1, and so the misalignment does not remain
6099 fixed across the execution of the inner-loop, and the realignment cannot
6100 be optimized (as illustrated in the following pseudo vectorized loop):
6102 for (i=0; i<N; i+=4)
6103 for (j=0; j<M; j++){
6104 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6105 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6106 // (assuming that we start from an aligned address).
6109 We therefore have to use the unoptimized realignment scheme:
6111 for (i=0; i<N; i+=4)
6112 for (j=k; j<M; j+=4)
6113 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6114 // that the misalignment of the initial address is
6115 // 0).
6117 The loop can then be vectorized as follows:
6119 for (k=0; k<4; k++){
6120 rt = get_realignment_token (&vp[k]);
6121 for (i=0; i<N; i+=4){
6122 v1 = vp[i+k];
6123 for (j=k; j<M; j+=4){
6124 v2 = vp[i+j+VS-1];
6125 va = REALIGN_LOAD <v1,v2,rt>;
6126 vs += va;
6127 v1 = v2;
6130 } */
6132 if (DR_IS_READ (dr))
6134 bool is_packed = false;
6135 tree type = (TREE_TYPE (DR_REF (dr)));
6137 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6138 && (!targetm.vectorize.builtin_mask_for_load
6139 || targetm.vectorize.builtin_mask_for_load ()))
6141 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6143 /* If we are doing SLP then the accesses need not have the
6144 same alignment, instead it depends on the SLP group size. */
6145 if (loop_vinfo
6146 && STMT_SLP_TYPE (stmt_info)
6147 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6148 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6149 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6151 else if (!loop_vinfo
6152 || (nested_in_vect_loop
6153 && (TREE_INT_CST_LOW (DR_STEP (dr))
6154 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6155 return dr_explicit_realign;
6156 else
6157 return dr_explicit_realign_optimized;
6159 if (!known_alignment_for_access_p (dr))
6160 is_packed = not_size_aligned (DR_REF (dr));
6162 if (targetm.vectorize.support_vector_misalignment
6163 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6164 /* Can't software pipeline the loads, but can at least do them. */
6165 return dr_unaligned_supported;
6167 else
6169 bool is_packed = false;
6170 tree type = (TREE_TYPE (DR_REF (dr)));
6172 if (!known_alignment_for_access_p (dr))
6173 is_packed = not_size_aligned (DR_REF (dr));
6175 if (targetm.vectorize.support_vector_misalignment
6176 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6177 return dr_unaligned_supported;
6180 /* Unsupported. */
6181 return dr_unaligned_unsupported;