Add support for SVE gather loads
[official-gcc.git] / gcc / tree-vect-data-refs.c
blobf79be863194cb02038691fba5f6f707d1494d366
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
65 machine_mode mode, array_mode;
66 bool limit_p;
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s["
78 HOST_WIDE_INT_PRINT_DEC "]\n",
79 GET_MODE_NAME (mode), count);
80 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation - we'd miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
133 if (is_gimple_assign (stmt)
134 && (gimple_assign_cast_p (stmt)
135 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
136 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
137 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
139 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
141 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
142 if (rhs < lhs)
143 scalar_type = rhs_type;
146 *lhs_size_unit = lhs;
147 *rhs_size_unit = rhs;
148 return scalar_type;
152 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
153 tested at run-time. Return TRUE if DDR was successfully inserted.
154 Return false if versioning is not supported. */
156 static bool
157 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
159 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
161 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
162 return false;
164 if (!runtime_alias_check_p (ddr, loop,
165 optimize_loop_nest_for_speed_p (loop)))
166 return false;
168 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
169 return true;
173 /* A subroutine of vect_analyze_data_ref_dependence. Handle
174 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
175 distances. These distances are conservatively correct but they don't
176 reflect a guaranteed dependence.
178 Return true if this function does all the work necessary to avoid
179 an alias or false if the caller should use the dependence distances
180 to limit the vectorization factor in the usual way. LOOP_DEPTH is
181 the depth of the loop described by LOOP_VINFO and the other arguments
182 are as for vect_analyze_data_ref_dependence. */
184 static bool
185 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
186 loop_vec_info loop_vinfo,
187 int loop_depth, unsigned int *max_vf)
189 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
190 lambda_vector dist_v;
191 unsigned int i;
192 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
194 int dist = dist_v[loop_depth];
195 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
197 /* If the user asserted safelen >= DIST consecutive iterations
198 can be executed concurrently, assume independence.
200 ??? An alternative would be to add the alias check even
201 in this case, and vectorize the fallback loop with the
202 maximum VF set to safelen. However, if the user has
203 explicitly given a length, it's less likely that that
204 would be a win. */
205 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
207 if ((unsigned int) loop->safelen < *max_vf)
208 *max_vf = loop->safelen;
209 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
210 continue;
213 /* For dependence distances of 2 or more, we have the option
214 of limiting VF or checking for an alias at runtime.
215 Prefer to check at runtime if we can, to avoid limiting
216 the VF unnecessarily when the bases are in fact independent.
218 Note that the alias checks will be removed if the VF ends up
219 being small enough. */
220 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
223 return true;
227 /* Function vect_analyze_data_ref_dependence.
229 Return TRUE if there (might) exist a dependence between a memory-reference
230 DRA and a memory-reference DRB. When versioning for alias may check a
231 dependence at run-time, return FALSE. Adjust *MAX_VF according to
232 the data dependence. */
234 static bool
235 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo,
237 unsigned int *max_vf)
239 unsigned int i;
240 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
241 struct data_reference *dra = DDR_A (ddr);
242 struct data_reference *drb = DDR_B (ddr);
243 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
244 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
245 lambda_vector dist_v;
246 unsigned int loop_depth;
248 /* In loop analysis all data references should be vectorizable. */
249 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
250 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
251 gcc_unreachable ();
253 /* Independent data accesses. */
254 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
255 return false;
257 if (dra == drb
258 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
259 return false;
261 /* We do not have to consider dependences between accesses that belong
262 to the same group. */
263 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
264 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
265 return false;
267 /* Even if we have an anti-dependence then, as the vectorized loop covers at
268 least two scalar iterations, there is always also a true dependence.
269 As the vectorizer does not re-order loads and stores we can ignore
270 the anti-dependence if TBAA can disambiguate both DRs similar to the
271 case with known negative distance anti-dependences (positive
272 distance anti-dependences would violate TBAA constraints). */
273 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
274 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
275 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
276 get_alias_set (DR_REF (drb))))
277 return false;
279 /* Unknown data dependence. */
280 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
282 /* If user asserted safelen consecutive iterations can be
283 executed concurrently, assume independence. */
284 if (loop->safelen >= 2)
286 if ((unsigned int) loop->safelen < *max_vf)
287 *max_vf = loop->safelen;
288 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
289 return false;
292 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
293 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
295 if (dump_enabled_p ())
297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
298 "versioning for alias not supported for: "
299 "can't determine dependence between ");
300 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
301 DR_REF (dra));
302 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
303 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
304 DR_REF (drb));
305 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
307 return true;
310 if (dump_enabled_p ())
312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
313 "versioning for alias required: "
314 "can't determine dependence between ");
315 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
316 DR_REF (dra));
317 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
319 DR_REF (drb));
320 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
323 /* Add to list of ddrs that need to be tested at run-time. */
324 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
327 /* Known data dependence. */
328 if (DDR_NUM_DIST_VECTS (ddr) == 0)
330 /* If user asserted safelen consecutive iterations can be
331 executed concurrently, assume independence. */
332 if (loop->safelen >= 2)
334 if ((unsigned int) loop->safelen < *max_vf)
335 *max_vf = loop->safelen;
336 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
337 return false;
340 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
341 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
343 if (dump_enabled_p ())
345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
346 "versioning for alias not supported for: "
347 "bad dist vector for ");
348 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
349 DR_REF (dra));
350 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
351 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
352 DR_REF (drb));
353 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
355 return true;
358 if (dump_enabled_p ())
360 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
361 "versioning for alias required: "
362 "bad dist vector for ");
363 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
364 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
365 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
366 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
368 /* Add to list of ddrs that need to be tested at run-time. */
369 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
372 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
374 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
375 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
376 loop_depth, max_vf))
377 return false;
379 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
381 int dist = dist_v[loop_depth];
383 if (dump_enabled_p ())
384 dump_printf_loc (MSG_NOTE, vect_location,
385 "dependence distance = %d.\n", dist);
387 if (dist == 0)
389 if (dump_enabled_p ())
391 dump_printf_loc (MSG_NOTE, vect_location,
392 "dependence distance == 0 between ");
393 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
394 dump_printf (MSG_NOTE, " and ");
395 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
396 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
399 /* When we perform grouped accesses and perform implicit CSE
400 by detecting equal accesses and doing disambiguation with
401 runtime alias tests like for
402 .. = a[i];
403 .. = a[i+1];
404 a[i] = ..;
405 a[i+1] = ..;
406 *p = ..;
407 .. = a[i];
408 .. = a[i+1];
409 where we will end up loading { a[i], a[i+1] } once, make
410 sure that inserting group loads before the first load and
411 stores after the last store will do the right thing.
412 Similar for groups like
413 a[i] = ...;
414 ... = a[i];
415 a[i+1] = ...;
416 where loads from the group interleave with the store. */
417 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
418 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
420 gimple *earlier_stmt;
421 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
422 if (DR_IS_WRITE
423 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
425 if (dump_enabled_p ())
426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
427 "READ_WRITE dependence in interleaving."
428 "\n");
429 return true;
433 continue;
436 if (dist > 0 && DDR_REVERSED_P (ddr))
438 /* If DDR_REVERSED_P the order of the data-refs in DDR was
439 reversed (to make distance vector positive), and the actual
440 distance is negative. */
441 if (dump_enabled_p ())
442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
443 "dependence distance negative.\n");
444 /* Record a negative dependence distance to later limit the
445 amount of stmt copying / unrolling we can perform.
446 Only need to handle read-after-write dependence. */
447 if (DR_IS_READ (drb)
448 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
449 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
450 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
451 continue;
454 unsigned int abs_dist = abs (dist);
455 if (abs_dist >= 2 && abs_dist < *max_vf)
457 /* The dependence distance requires reduction of the maximal
458 vectorization factor. */
459 *max_vf = abs (dist);
460 if (dump_enabled_p ())
461 dump_printf_loc (MSG_NOTE, vect_location,
462 "adjusting maximal vectorization factor to %i\n",
463 *max_vf);
466 if (abs_dist >= *max_vf)
468 /* Dependence distance does not create dependence, as far as
469 vectorization is concerned, in this case. */
470 if (dump_enabled_p ())
471 dump_printf_loc (MSG_NOTE, vect_location,
472 "dependence distance >= VF.\n");
473 continue;
476 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479 "not vectorized, possible dependence "
480 "between data-refs ");
481 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
482 dump_printf (MSG_NOTE, " and ");
483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
484 dump_printf (MSG_NOTE, "\n");
487 return true;
490 return false;
493 /* Function vect_analyze_data_ref_dependences.
495 Examine all the data references in the loop, and make sure there do not
496 exist any data dependences between them. Set *MAX_VF according to
497 the maximum vectorization factor the data dependences allow. */
499 bool
500 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
501 unsigned int *max_vf)
503 unsigned int i;
504 struct data_dependence_relation *ddr;
506 if (dump_enabled_p ())
507 dump_printf_loc (MSG_NOTE, vect_location,
508 "=== vect_analyze_data_ref_dependences ===\n");
510 LOOP_VINFO_DDRS (loop_vinfo)
511 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
512 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
513 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
514 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
515 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
516 &LOOP_VINFO_DDRS (loop_vinfo),
517 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
518 return false;
520 /* For epilogues we either have no aliases or alias versioning
521 was applied to original loop. Therefore we may just get max_vf
522 using VF of original loop. */
523 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
524 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
525 else
526 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
527 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
528 return false;
530 return true;
534 /* Function vect_slp_analyze_data_ref_dependence.
536 Return TRUE if there (might) exist a dependence between a memory-reference
537 DRA and a memory-reference DRB. When versioning for alias may check a
538 dependence at run-time, return FALSE. Adjust *MAX_VF according to
539 the data dependence. */
541 static bool
542 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
544 struct data_reference *dra = DDR_A (ddr);
545 struct data_reference *drb = DDR_B (ddr);
547 /* We need to check dependences of statements marked as unvectorizable
548 as well, they still can prohibit vectorization. */
550 /* Independent data accesses. */
551 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
552 return false;
554 if (dra == drb)
555 return false;
557 /* Read-read is OK. */
558 if (DR_IS_READ (dra) && DR_IS_READ (drb))
559 return false;
561 /* If dra and drb are part of the same interleaving chain consider
562 them independent. */
563 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
564 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
565 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
566 return false;
568 /* Unknown data dependence. */
569 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
571 if (dump_enabled_p ())
573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
574 "can't determine dependence between ");
575 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
576 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
577 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
578 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
581 else if (dump_enabled_p ())
583 dump_printf_loc (MSG_NOTE, vect_location,
584 "determined dependence between ");
585 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
586 dump_printf (MSG_NOTE, " and ");
587 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
588 dump_printf (MSG_NOTE, "\n");
591 return true;
595 /* Analyze dependences involved in the transform of SLP NODE. STORES
596 contain the vector of scalar stores of this instance if we are
597 disambiguating the loads. */
599 static bool
600 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
601 vec<gimple *> stores, gimple *last_store)
603 /* This walks over all stmts involved in the SLP load/store done
604 in NODE verifying we can sink them up to the last stmt in the
605 group. */
606 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
607 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
609 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
610 if (access == last_access)
611 continue;
612 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
613 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
614 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
616 gimple *stmt = gsi_stmt (gsi);
617 if (! gimple_vuse (stmt)
618 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
619 continue;
621 /* If we couldn't record a (single) data reference for this
622 stmt we have to give up. */
623 /* ??? Here and below if dependence analysis fails we can resort
624 to the alias oracle which can handle more kinds of stmts. */
625 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
626 if (!dr_b)
627 return false;
629 bool dependent = false;
630 /* If we run into a store of this same instance (we've just
631 marked those) then delay dependence checking until we run
632 into the last store because this is where it will have
633 been sunk to (and we verify if we can do that as well). */
634 if (gimple_visited_p (stmt))
636 if (stmt != last_store)
637 continue;
638 unsigned i;
639 gimple *store;
640 FOR_EACH_VEC_ELT (stores, i, store)
642 data_reference *store_dr
643 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
644 ddr_p ddr = initialize_data_dependence_relation
645 (dr_a, store_dr, vNULL);
646 dependent = vect_slp_analyze_data_ref_dependence (ddr);
647 free_dependence_relation (ddr);
648 if (dependent)
649 break;
652 else
654 ddr_p ddr = initialize_data_dependence_relation (dr_a,
655 dr_b, vNULL);
656 dependent = vect_slp_analyze_data_ref_dependence (ddr);
657 free_dependence_relation (ddr);
659 if (dependent)
660 return false;
663 return true;
667 /* Function vect_analyze_data_ref_dependences.
669 Examine all the data references in the basic-block, and make sure there
670 do not exist any data dependences between them. Set *MAX_VF according to
671 the maximum vectorization factor the data dependences allow. */
673 bool
674 vect_slp_analyze_instance_dependence (slp_instance instance)
676 if (dump_enabled_p ())
677 dump_printf_loc (MSG_NOTE, vect_location,
678 "=== vect_slp_analyze_instance_dependence ===\n");
680 /* The stores of this instance are at the root of the SLP tree. */
681 slp_tree store = SLP_INSTANCE_TREE (instance);
682 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
683 store = NULL;
685 /* Verify we can sink stores to the vectorized stmt insert location. */
686 gimple *last_store = NULL;
687 if (store)
689 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
690 return false;
692 /* Mark stores in this instance and remember the last one. */
693 last_store = vect_find_last_scalar_stmt_in_slp (store);
694 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
695 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
698 bool res = true;
700 /* Verify we can sink loads to the vectorized stmt insert location,
701 special-casing stores of this instance. */
702 slp_tree load;
703 unsigned int i;
704 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
705 if (! vect_slp_analyze_node_dependences (instance, load,
706 store
707 ? SLP_TREE_SCALAR_STMTS (store)
708 : vNULL, last_store))
710 res = false;
711 break;
714 /* Unset the visited flag. */
715 if (store)
716 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
717 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
719 return res;
722 /* Record in VINFO the base alignment guarantee given by DRB. STMT is
723 the statement that contains DRB, which is useful for recording in the
724 dump file. */
726 static void
727 vect_record_base_alignment (vec_info *vinfo, gimple *stmt,
728 innermost_loop_behavior *drb)
730 bool existed;
731 innermost_loop_behavior *&entry
732 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
733 if (!existed || entry->base_alignment < drb->base_alignment)
735 entry = drb;
736 if (dump_enabled_p ())
738 dump_printf_loc (MSG_NOTE, vect_location,
739 "recording new base alignment for ");
740 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
741 dump_printf (MSG_NOTE, "\n");
742 dump_printf_loc (MSG_NOTE, vect_location,
743 " alignment: %d\n", drb->base_alignment);
744 dump_printf_loc (MSG_NOTE, vect_location,
745 " misalignment: %d\n", drb->base_misalignment);
746 dump_printf_loc (MSG_NOTE, vect_location,
747 " based on: ");
748 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
753 /* If the region we're going to vectorize is reached, all unconditional
754 data references occur at least once. We can therefore pool the base
755 alignment guarantees from each unconditional reference. Do this by
756 going through all the data references in VINFO and checking whether
757 the containing statement makes the reference unconditionally. If so,
758 record the alignment of the base address in VINFO so that it can be
759 used for all other references with the same base. */
761 void
762 vect_record_base_alignments (vec_info *vinfo)
764 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
765 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
766 data_reference *dr;
767 unsigned int i;
768 FOR_EACH_VEC_ELT (vinfo->datarefs, i, dr)
769 if (!DR_IS_CONDITIONAL_IN_STMT (dr))
771 gimple *stmt = DR_STMT (dr);
772 vect_record_base_alignment (vinfo, stmt, &DR_INNERMOST (dr));
774 /* If DR is nested in the loop that is being vectorized, we can also
775 record the alignment of the base wrt the outer loop. */
776 if (loop && nested_in_vect_loop_p (loop, stmt))
778 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
779 vect_record_base_alignment
780 (vinfo, stmt, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
785 /* Return the target alignment for the vectorized form of DR. */
787 static unsigned int
788 vect_calculate_target_alignment (struct data_reference *dr)
790 gimple *stmt = DR_STMT (dr);
791 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
792 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
793 return targetm.vectorize.preferred_vector_alignment (vectype);
796 /* Function vect_compute_data_ref_alignment
798 Compute the misalignment of the data reference DR.
800 Output:
801 1. If during the misalignment computation it is found that the data reference
802 cannot be vectorized then false is returned.
803 2. DR_MISALIGNMENT (DR) is defined.
805 FOR NOW: No analysis is actually performed. Misalignment is calculated
806 only for trivial cases. TODO. */
808 bool
809 vect_compute_data_ref_alignment (struct data_reference *dr)
811 gimple *stmt = DR_STMT (dr);
812 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
813 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
814 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
815 struct loop *loop = NULL;
816 tree ref = DR_REF (dr);
817 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
819 if (dump_enabled_p ())
820 dump_printf_loc (MSG_NOTE, vect_location,
821 "vect_compute_data_ref_alignment:\n");
823 if (loop_vinfo)
824 loop = LOOP_VINFO_LOOP (loop_vinfo);
826 /* Initialize misalignment to unknown. */
827 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
829 innermost_loop_behavior *drb = vect_dr_behavior (dr);
830 bool step_preserves_misalignment_p;
832 unsigned HOST_WIDE_INT vector_alignment
833 = vect_calculate_target_alignment (dr) / BITS_PER_UNIT;
834 DR_TARGET_ALIGNMENT (dr) = vector_alignment;
836 /* No step for BB vectorization. */
837 if (!loop)
839 gcc_assert (integer_zerop (drb->step));
840 step_preserves_misalignment_p = true;
843 /* In case the dataref is in an inner-loop of the loop that is being
844 vectorized (LOOP), we use the base and misalignment information
845 relative to the outer-loop (LOOP). This is ok only if the misalignment
846 stays the same throughout the execution of the inner-loop, which is why
847 we have to check that the stride of the dataref in the inner-loop evenly
848 divides by the vector alignment. */
849 else if (nested_in_vect_loop_p (loop, stmt))
851 step_preserves_misalignment_p
852 = (DR_STEP_ALIGNMENT (dr) % vector_alignment) == 0;
854 if (dump_enabled_p ())
856 if (step_preserves_misalignment_p)
857 dump_printf_loc (MSG_NOTE, vect_location,
858 "inner step divides the vector alignment.\n");
859 else
860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
861 "inner step doesn't divide the vector"
862 " alignment.\n");
866 /* Similarly we can only use base and misalignment information relative to
867 an innermost loop if the misalignment stays the same throughout the
868 execution of the loop. As above, this is the case if the stride of
869 the dataref evenly divides by the alignment. */
870 else
872 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
873 step_preserves_misalignment_p
874 = multiple_p (DR_STEP_ALIGNMENT (dr) * vf, vector_alignment);
876 if (!step_preserves_misalignment_p && dump_enabled_p ())
877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
878 "step doesn't divide the vector alignment.\n");
881 unsigned int base_alignment = drb->base_alignment;
882 unsigned int base_misalignment = drb->base_misalignment;
884 /* Calculate the maximum of the pooled base address alignment and the
885 alignment that we can compute for DR itself. */
886 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
887 if (entry && base_alignment < (*entry)->base_alignment)
889 base_alignment = (*entry)->base_alignment;
890 base_misalignment = (*entry)->base_misalignment;
893 if (drb->offset_alignment < vector_alignment
894 || !step_preserves_misalignment_p
895 /* We need to know whether the step wrt the vectorized loop is
896 negative when computing the starting misalignment below. */
897 || TREE_CODE (drb->step) != INTEGER_CST)
899 if (dump_enabled_p ())
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
902 "Unknown alignment for access: ");
903 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
904 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
906 return true;
909 if (base_alignment < vector_alignment)
911 tree base = drb->base_address;
912 if (TREE_CODE (base) == ADDR_EXPR)
913 base = TREE_OPERAND (base, 0);
914 if (!vect_can_force_dr_alignment_p (base,
915 vector_alignment * BITS_PER_UNIT))
917 if (dump_enabled_p ())
919 dump_printf_loc (MSG_NOTE, vect_location,
920 "can't force alignment of ref: ");
921 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
922 dump_printf (MSG_NOTE, "\n");
924 return true;
927 /* Force the alignment of the decl.
928 NOTE: This is the only change to the code we make during
929 the analysis phase, before deciding to vectorize the loop. */
930 if (dump_enabled_p ())
932 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
933 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
934 dump_printf (MSG_NOTE, "\n");
937 DR_VECT_AUX (dr)->base_decl = base;
938 DR_VECT_AUX (dr)->base_misaligned = true;
939 base_misalignment = 0;
941 poly_int64 misalignment
942 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
944 /* If this is a backward running DR then first access in the larger
945 vectype actually is N-1 elements before the address in the DR.
946 Adjust misalign accordingly. */
947 if (tree_int_cst_sgn (drb->step) < 0)
948 /* PLUS because STEP is negative. */
949 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
950 * TREE_INT_CST_LOW (drb->step));
952 unsigned int const_misalignment;
953 if (!known_misalignment (misalignment, vector_alignment,
954 &const_misalignment))
956 if (dump_enabled_p ())
958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
959 "Non-constant misalignment for access: ");
960 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
961 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
963 return true;
966 SET_DR_MISALIGNMENT (dr, const_misalignment);
968 if (dump_enabled_p ())
970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
971 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
972 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
973 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
976 return true;
979 /* Function vect_update_misalignment_for_peel.
980 Sets DR's misalignment
981 - to 0 if it has the same alignment as DR_PEEL,
982 - to the misalignment computed using NPEEL if DR's salignment is known,
983 - to -1 (unknown) otherwise.
985 DR - the data reference whose misalignment is to be adjusted.
986 DR_PEEL - the data reference whose misalignment is being made
987 zero in the vector loop by the peel.
988 NPEEL - the number of iterations in the peel loop if the misalignment
989 of DR_PEEL is known at compile time. */
991 static void
992 vect_update_misalignment_for_peel (struct data_reference *dr,
993 struct data_reference *dr_peel, int npeel)
995 unsigned int i;
996 vec<dr_p> same_aligned_drs;
997 struct data_reference *current_dr;
998 int dr_size = vect_get_scalar_dr_size (dr);
999 int dr_peel_size = vect_get_scalar_dr_size (dr_peel);
1000 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1001 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1003 /* For interleaved data accesses the step in the loop must be multiplied by
1004 the size of the interleaving group. */
1005 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1006 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1007 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1008 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1010 /* It can be assumed that the data refs with the same alignment as dr_peel
1011 are aligned in the vector loop. */
1012 same_aligned_drs
1013 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1014 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1016 if (current_dr != dr)
1017 continue;
1018 gcc_assert (!known_alignment_for_access_p (dr)
1019 || !known_alignment_for_access_p (dr_peel)
1020 || (DR_MISALIGNMENT (dr) / dr_size
1021 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
1022 SET_DR_MISALIGNMENT (dr, 0);
1023 return;
1026 if (known_alignment_for_access_p (dr)
1027 && known_alignment_for_access_p (dr_peel))
1029 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1030 int misal = DR_MISALIGNMENT (dr);
1031 misal += negative ? -npeel * dr_size : npeel * dr_size;
1032 misal &= DR_TARGET_ALIGNMENT (dr) - 1;
1033 SET_DR_MISALIGNMENT (dr, misal);
1034 return;
1037 if (dump_enabled_p ())
1038 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1039 "to unknown (-1).\n");
1040 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
1044 /* Function verify_data_ref_alignment
1046 Return TRUE if DR can be handled with respect to alignment. */
1048 static bool
1049 verify_data_ref_alignment (data_reference_p dr)
1051 enum dr_alignment_support supportable_dr_alignment
1052 = vect_supportable_dr_alignment (dr, false);
1053 if (!supportable_dr_alignment)
1055 if (dump_enabled_p ())
1057 if (DR_IS_READ (dr))
1058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1059 "not vectorized: unsupported unaligned load.");
1060 else
1061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1062 "not vectorized: unsupported unaligned "
1063 "store.");
1065 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1066 DR_REF (dr));
1067 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1069 return false;
1072 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1073 dump_printf_loc (MSG_NOTE, vect_location,
1074 "Vectorizing an unaligned access.\n");
1076 return true;
1079 /* Function vect_verify_datarefs_alignment
1081 Return TRUE if all data references in the loop can be
1082 handled with respect to alignment. */
1084 bool
1085 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1087 vec<data_reference_p> datarefs = vinfo->datarefs;
1088 struct data_reference *dr;
1089 unsigned int i;
1091 FOR_EACH_VEC_ELT (datarefs, i, dr)
1093 gimple *stmt = DR_STMT (dr);
1094 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1096 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1097 continue;
1099 /* For interleaving, only the alignment of the first access matters. */
1100 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1101 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1102 continue;
1104 /* Strided accesses perform only component accesses, alignment is
1105 irrelevant for them. */
1106 if (STMT_VINFO_STRIDED_P (stmt_info)
1107 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1108 continue;
1110 if (! verify_data_ref_alignment (dr))
1111 return false;
1114 return true;
1117 /* Given an memory reference EXP return whether its alignment is less
1118 than its size. */
1120 static bool
1121 not_size_aligned (tree exp)
1123 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1124 return true;
1126 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1127 > get_object_alignment (exp));
1130 /* Function vector_alignment_reachable_p
1132 Return true if vector alignment for DR is reachable by peeling
1133 a few loop iterations. Return false otherwise. */
1135 static bool
1136 vector_alignment_reachable_p (struct data_reference *dr)
1138 gimple *stmt = DR_STMT (dr);
1139 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1140 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1142 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1144 /* For interleaved access we peel only if number of iterations in
1145 the prolog loop ({VF - misalignment}), is a multiple of the
1146 number of the interleaved accesses. */
1147 int elem_size, mis_in_elements;
1149 /* FORNOW: handle only known alignment. */
1150 if (!known_alignment_for_access_p (dr))
1151 return false;
1153 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1154 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1155 elem_size = vector_element_size (vector_size, nelements);
1156 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1158 if (!multiple_p (nelements - mis_in_elements, GROUP_SIZE (stmt_info)))
1159 return false;
1162 /* If misalignment is known at the compile time then allow peeling
1163 only if natural alignment is reachable through peeling. */
1164 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1166 HOST_WIDE_INT elmsize =
1167 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1168 if (dump_enabled_p ())
1170 dump_printf_loc (MSG_NOTE, vect_location,
1171 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1172 dump_printf (MSG_NOTE,
1173 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1175 if (DR_MISALIGNMENT (dr) % elmsize)
1177 if (dump_enabled_p ())
1178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1179 "data size does not divide the misalignment.\n");
1180 return false;
1184 if (!known_alignment_for_access_p (dr))
1186 tree type = TREE_TYPE (DR_REF (dr));
1187 bool is_packed = not_size_aligned (DR_REF (dr));
1188 if (dump_enabled_p ())
1189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1190 "Unknown misalignment, %snaturally aligned\n",
1191 is_packed ? "not " : "");
1192 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1195 return true;
1199 /* Calculate the cost of the memory access represented by DR. */
1201 static void
1202 vect_get_data_access_cost (struct data_reference *dr,
1203 unsigned int *inside_cost,
1204 unsigned int *outside_cost,
1205 stmt_vector_for_cost *body_cost_vec)
1207 gimple *stmt = DR_STMT (dr);
1208 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1209 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1210 int ncopies;
1212 if (PURE_SLP_STMT (stmt_info))
1213 ncopies = 1;
1214 else
1215 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1217 if (DR_IS_READ (dr))
1218 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1219 NULL, body_cost_vec, false);
1220 else
1221 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1223 if (dump_enabled_p ())
1224 dump_printf_loc (MSG_NOTE, vect_location,
1225 "vect_get_data_access_cost: inside_cost = %d, "
1226 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1230 typedef struct _vect_peel_info
1232 struct data_reference *dr;
1233 int npeel;
1234 unsigned int count;
1235 } *vect_peel_info;
1237 typedef struct _vect_peel_extended_info
1239 struct _vect_peel_info peel_info;
1240 unsigned int inside_cost;
1241 unsigned int outside_cost;
1242 } *vect_peel_extended_info;
1245 /* Peeling hashtable helpers. */
1247 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1249 static inline hashval_t hash (const _vect_peel_info *);
1250 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1253 inline hashval_t
1254 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1256 return (hashval_t) peel_info->npeel;
1259 inline bool
1260 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1262 return (a->npeel == b->npeel);
1266 /* Insert DR into peeling hash table with NPEEL as key. */
1268 static void
1269 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1270 loop_vec_info loop_vinfo, struct data_reference *dr,
1271 int npeel)
1273 struct _vect_peel_info elem, *slot;
1274 _vect_peel_info **new_slot;
1275 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1277 elem.npeel = npeel;
1278 slot = peeling_htab->find (&elem);
1279 if (slot)
1280 slot->count++;
1281 else
1283 slot = XNEW (struct _vect_peel_info);
1284 slot->npeel = npeel;
1285 slot->dr = dr;
1286 slot->count = 1;
1287 new_slot = peeling_htab->find_slot (slot, INSERT);
1288 *new_slot = slot;
1291 if (!supportable_dr_alignment
1292 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1293 slot->count += VECT_MAX_COST;
1297 /* Traverse peeling hash table to find peeling option that aligns maximum
1298 number of data accesses. */
1301 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1302 _vect_peel_extended_info *max)
1304 vect_peel_info elem = *slot;
1306 if (elem->count > max->peel_info.count
1307 || (elem->count == max->peel_info.count
1308 && max->peel_info.npeel > elem->npeel))
1310 max->peel_info.npeel = elem->npeel;
1311 max->peel_info.count = elem->count;
1312 max->peel_info.dr = elem->dr;
1315 return 1;
1318 /* Get the costs of peeling NPEEL iterations checking data access costs
1319 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1320 misalignment will be zero after peeling. */
1322 static void
1323 vect_get_peeling_costs_all_drs (vec<data_reference_p> datarefs,
1324 struct data_reference *dr0,
1325 unsigned int *inside_cost,
1326 unsigned int *outside_cost,
1327 stmt_vector_for_cost *body_cost_vec,
1328 unsigned int npeel,
1329 bool unknown_misalignment)
1331 unsigned i;
1332 data_reference *dr;
1334 FOR_EACH_VEC_ELT (datarefs, i, dr)
1336 gimple *stmt = DR_STMT (dr);
1337 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1338 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1339 continue;
1341 /* For interleaving, only the alignment of the first access
1342 matters. */
1343 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1344 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1345 continue;
1347 /* Strided accesses perform only component accesses, alignment is
1348 irrelevant for them. */
1349 if (STMT_VINFO_STRIDED_P (stmt_info)
1350 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1351 continue;
1353 int save_misalignment;
1354 save_misalignment = DR_MISALIGNMENT (dr);
1355 if (npeel == 0)
1357 else if (unknown_misalignment && dr == dr0)
1358 SET_DR_MISALIGNMENT (dr, 0);
1359 else
1360 vect_update_misalignment_for_peel (dr, dr0, npeel);
1361 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1362 body_cost_vec);
1363 SET_DR_MISALIGNMENT (dr, save_misalignment);
1367 /* Traverse peeling hash table and calculate cost for each peeling option.
1368 Find the one with the lowest cost. */
1371 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1372 _vect_peel_extended_info *min)
1374 vect_peel_info elem = *slot;
1375 int dummy;
1376 unsigned int inside_cost = 0, outside_cost = 0;
1377 gimple *stmt = DR_STMT (elem->dr);
1378 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1379 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1380 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1381 epilogue_cost_vec;
1383 prologue_cost_vec.create (2);
1384 body_cost_vec.create (2);
1385 epilogue_cost_vec.create (2);
1387 vect_get_peeling_costs_all_drs (LOOP_VINFO_DATAREFS (loop_vinfo),
1388 elem->dr, &inside_cost, &outside_cost,
1389 &body_cost_vec, elem->npeel, false);
1391 body_cost_vec.release ();
1393 outside_cost += vect_get_known_peeling_cost
1394 (loop_vinfo, elem->npeel, &dummy,
1395 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1396 &prologue_cost_vec, &epilogue_cost_vec);
1398 /* Prologue and epilogue costs are added to the target model later.
1399 These costs depend only on the scalar iteration cost, the
1400 number of peeling iterations finally chosen, and the number of
1401 misaligned statements. So discard the information found here. */
1402 prologue_cost_vec.release ();
1403 epilogue_cost_vec.release ();
1405 if (inside_cost < min->inside_cost
1406 || (inside_cost == min->inside_cost
1407 && outside_cost < min->outside_cost))
1409 min->inside_cost = inside_cost;
1410 min->outside_cost = outside_cost;
1411 min->peel_info.dr = elem->dr;
1412 min->peel_info.npeel = elem->npeel;
1413 min->peel_info.count = elem->count;
1416 return 1;
1420 /* Choose best peeling option by traversing peeling hash table and either
1421 choosing an option with the lowest cost (if cost model is enabled) or the
1422 option that aligns as many accesses as possible. */
1424 static struct _vect_peel_extended_info
1425 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1426 loop_vec_info loop_vinfo)
1428 struct _vect_peel_extended_info res;
1430 res.peel_info.dr = NULL;
1432 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1434 res.inside_cost = INT_MAX;
1435 res.outside_cost = INT_MAX;
1436 peeling_htab->traverse <_vect_peel_extended_info *,
1437 vect_peeling_hash_get_lowest_cost> (&res);
1439 else
1441 res.peel_info.count = 0;
1442 peeling_htab->traverse <_vect_peel_extended_info *,
1443 vect_peeling_hash_get_most_frequent> (&res);
1444 res.inside_cost = 0;
1445 res.outside_cost = 0;
1448 return res;
1451 /* Return true if the new peeling NPEEL is supported. */
1453 static bool
1454 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1455 unsigned npeel)
1457 unsigned i;
1458 struct data_reference *dr = NULL;
1459 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1460 gimple *stmt;
1461 stmt_vec_info stmt_info;
1462 enum dr_alignment_support supportable_dr_alignment;
1464 /* Ensure that all data refs can be vectorized after the peel. */
1465 FOR_EACH_VEC_ELT (datarefs, i, dr)
1467 int save_misalignment;
1469 if (dr == dr0)
1470 continue;
1472 stmt = DR_STMT (dr);
1473 stmt_info = vinfo_for_stmt (stmt);
1474 /* For interleaving, only the alignment of the first access
1475 matters. */
1476 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1477 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1478 continue;
1480 /* Strided accesses perform only component accesses, alignment is
1481 irrelevant for them. */
1482 if (STMT_VINFO_STRIDED_P (stmt_info)
1483 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1484 continue;
1486 save_misalignment = DR_MISALIGNMENT (dr);
1487 vect_update_misalignment_for_peel (dr, dr0, npeel);
1488 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1489 SET_DR_MISALIGNMENT (dr, save_misalignment);
1491 if (!supportable_dr_alignment)
1492 return false;
1495 return true;
1498 /* Function vect_enhance_data_refs_alignment
1500 This pass will use loop versioning and loop peeling in order to enhance
1501 the alignment of data references in the loop.
1503 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1504 original loop is to be vectorized. Any other loops that are created by
1505 the transformations performed in this pass - are not supposed to be
1506 vectorized. This restriction will be relaxed.
1508 This pass will require a cost model to guide it whether to apply peeling
1509 or versioning or a combination of the two. For example, the scheme that
1510 intel uses when given a loop with several memory accesses, is as follows:
1511 choose one memory access ('p') which alignment you want to force by doing
1512 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1513 other accesses are not necessarily aligned, or (2) use loop versioning to
1514 generate one loop in which all accesses are aligned, and another loop in
1515 which only 'p' is necessarily aligned.
1517 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1518 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1519 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1521 Devising a cost model is the most critical aspect of this work. It will
1522 guide us on which access to peel for, whether to use loop versioning, how
1523 many versions to create, etc. The cost model will probably consist of
1524 generic considerations as well as target specific considerations (on
1525 powerpc for example, misaligned stores are more painful than misaligned
1526 loads).
1528 Here are the general steps involved in alignment enhancements:
1530 -- original loop, before alignment analysis:
1531 for (i=0; i<N; i++){
1532 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1533 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1536 -- After vect_compute_data_refs_alignment:
1537 for (i=0; i<N; i++){
1538 x = q[i]; # DR_MISALIGNMENT(q) = 3
1539 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1542 -- Possibility 1: we do loop versioning:
1543 if (p is aligned) {
1544 for (i=0; i<N; i++){ # loop 1A
1545 x = q[i]; # DR_MISALIGNMENT(q) = 3
1546 p[i] = y; # DR_MISALIGNMENT(p) = 0
1549 else {
1550 for (i=0; i<N; i++){ # loop 1B
1551 x = q[i]; # DR_MISALIGNMENT(q) = 3
1552 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1556 -- Possibility 2: we do loop peeling:
1557 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1558 x = q[i];
1559 p[i] = y;
1561 for (i = 3; i < N; i++){ # loop 2A
1562 x = q[i]; # DR_MISALIGNMENT(q) = 0
1563 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1566 -- Possibility 3: combination of loop peeling and versioning:
1567 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1568 x = q[i];
1569 p[i] = y;
1571 if (p is aligned) {
1572 for (i = 3; i<N; i++){ # loop 3A
1573 x = q[i]; # DR_MISALIGNMENT(q) = 0
1574 p[i] = y; # DR_MISALIGNMENT(p) = 0
1577 else {
1578 for (i = 3; i<N; i++){ # loop 3B
1579 x = q[i]; # DR_MISALIGNMENT(q) = 0
1580 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1584 These loops are later passed to loop_transform to be vectorized. The
1585 vectorizer will use the alignment information to guide the transformation
1586 (whether to generate regular loads/stores, or with special handling for
1587 misalignment). */
1589 bool
1590 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1592 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1593 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1594 enum dr_alignment_support supportable_dr_alignment;
1595 struct data_reference *dr0 = NULL, *first_store = NULL;
1596 struct data_reference *dr;
1597 unsigned int i, j;
1598 bool do_peeling = false;
1599 bool do_versioning = false;
1600 bool stat;
1601 gimple *stmt;
1602 stmt_vec_info stmt_info;
1603 unsigned int npeel = 0;
1604 bool one_misalignment_known = false;
1605 bool one_misalignment_unknown = false;
1606 bool one_dr_unsupportable = false;
1607 struct data_reference *unsupportable_dr = NULL;
1608 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1609 unsigned possible_npeel_number = 1;
1610 tree vectype;
1611 unsigned int mis, same_align_drs_max = 0;
1612 hash_table<peel_info_hasher> peeling_htab (1);
1614 if (dump_enabled_p ())
1615 dump_printf_loc (MSG_NOTE, vect_location,
1616 "=== vect_enhance_data_refs_alignment ===\n");
1618 /* Reset data so we can safely be called multiple times. */
1619 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1620 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1622 /* While cost model enhancements are expected in the future, the high level
1623 view of the code at this time is as follows:
1625 A) If there is a misaligned access then see if peeling to align
1626 this access can make all data references satisfy
1627 vect_supportable_dr_alignment. If so, update data structures
1628 as needed and return true.
1630 B) If peeling wasn't possible and there is a data reference with an
1631 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1632 then see if loop versioning checks can be used to make all data
1633 references satisfy vect_supportable_dr_alignment. If so, update
1634 data structures as needed and return true.
1636 C) If neither peeling nor versioning were successful then return false if
1637 any data reference does not satisfy vect_supportable_dr_alignment.
1639 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1641 Note, Possibility 3 above (which is peeling and versioning together) is not
1642 being done at this time. */
1644 /* (1) Peeling to force alignment. */
1646 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1647 Considerations:
1648 + How many accesses will become aligned due to the peeling
1649 - How many accesses will become unaligned due to the peeling,
1650 and the cost of misaligned accesses.
1651 - The cost of peeling (the extra runtime checks, the increase
1652 in code size). */
1654 FOR_EACH_VEC_ELT (datarefs, i, dr)
1656 stmt = DR_STMT (dr);
1657 stmt_info = vinfo_for_stmt (stmt);
1659 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1660 continue;
1662 /* For interleaving, only the alignment of the first access
1663 matters. */
1664 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1665 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1666 continue;
1668 /* For invariant accesses there is nothing to enhance. */
1669 if (integer_zerop (DR_STEP (dr)))
1670 continue;
1672 /* Strided accesses perform only component accesses, alignment is
1673 irrelevant for them. */
1674 if (STMT_VINFO_STRIDED_P (stmt_info)
1675 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1676 continue;
1678 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1679 do_peeling = vector_alignment_reachable_p (dr);
1680 if (do_peeling)
1682 if (known_alignment_for_access_p (dr))
1684 unsigned int npeel_tmp = 0;
1685 bool negative = tree_int_cst_compare (DR_STEP (dr),
1686 size_zero_node) < 0;
1688 vectype = STMT_VINFO_VECTYPE (stmt_info);
1689 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1690 unsigned int dr_size = vect_get_scalar_dr_size (dr);
1691 mis = (negative ? DR_MISALIGNMENT (dr) : -DR_MISALIGNMENT (dr));
1692 if (DR_MISALIGNMENT (dr) != 0)
1693 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1695 /* For multiple types, it is possible that the bigger type access
1696 will have more than one peeling option. E.g., a loop with two
1697 types: one of size (vector size / 4), and the other one of
1698 size (vector size / 8). Vectorization factor will 8. If both
1699 accesses are misaligned by 3, the first one needs one scalar
1700 iteration to be aligned, and the second one needs 5. But the
1701 first one will be aligned also by peeling 5 scalar
1702 iterations, and in that case both accesses will be aligned.
1703 Hence, except for the immediate peeling amount, we also want
1704 to try to add full vector size, while we don't exceed
1705 vectorization factor.
1706 We do this automatically for cost model, since we calculate
1707 cost for every peeling option. */
1708 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1710 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1711 ? vf * GROUP_SIZE (stmt_info) : vf);
1712 possible_npeel_number
1713 = vect_get_num_vectors (nscalars, vectype);
1715 /* NPEEL_TMP is 0 when there is no misalignment, but also
1716 allow peeling NELEMENTS. */
1717 if (DR_MISALIGNMENT (dr) == 0)
1718 possible_npeel_number++;
1721 /* Save info about DR in the hash table. Also include peeling
1722 amounts according to the explanation above. */
1723 for (j = 0; j < possible_npeel_number; j++)
1725 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1726 dr, npeel_tmp);
1727 npeel_tmp += target_align / dr_size;
1730 one_misalignment_known = true;
1732 else
1734 /* If we don't know any misalignment values, we prefer
1735 peeling for data-ref that has the maximum number of data-refs
1736 with the same alignment, unless the target prefers to align
1737 stores over load. */
1738 unsigned same_align_drs
1739 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1740 if (!dr0
1741 || same_align_drs_max < same_align_drs)
1743 same_align_drs_max = same_align_drs;
1744 dr0 = dr;
1746 /* For data-refs with the same number of related
1747 accesses prefer the one where the misalign
1748 computation will be invariant in the outermost loop. */
1749 else if (same_align_drs_max == same_align_drs)
1751 struct loop *ivloop0, *ivloop;
1752 ivloop0 = outermost_invariant_loop_for_expr
1753 (loop, DR_BASE_ADDRESS (dr0));
1754 ivloop = outermost_invariant_loop_for_expr
1755 (loop, DR_BASE_ADDRESS (dr));
1756 if ((ivloop && !ivloop0)
1757 || (ivloop && ivloop0
1758 && flow_loop_nested_p (ivloop, ivloop0)))
1759 dr0 = dr;
1762 one_misalignment_unknown = true;
1764 /* Check for data refs with unsupportable alignment that
1765 can be peeled. */
1766 if (!supportable_dr_alignment)
1768 one_dr_unsupportable = true;
1769 unsupportable_dr = dr;
1772 if (!first_store && DR_IS_WRITE (dr))
1773 first_store = dr;
1776 else
1778 if (!aligned_access_p (dr))
1780 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1782 "vector alignment may not be reachable\n");
1783 break;
1788 /* Check if we can possibly peel the loop. */
1789 if (!vect_can_advance_ivs_p (loop_vinfo)
1790 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1791 || loop->inner)
1792 do_peeling = false;
1794 struct _vect_peel_extended_info peel_for_known_alignment;
1795 struct _vect_peel_extended_info peel_for_unknown_alignment;
1796 struct _vect_peel_extended_info best_peel;
1798 peel_for_unknown_alignment.inside_cost = INT_MAX;
1799 peel_for_unknown_alignment.outside_cost = INT_MAX;
1800 peel_for_unknown_alignment.peel_info.count = 0;
1802 if (do_peeling
1803 && one_misalignment_unknown)
1805 /* Check if the target requires to prefer stores over loads, i.e., if
1806 misaligned stores are more expensive than misaligned loads (taking
1807 drs with same alignment into account). */
1808 unsigned int load_inside_cost = 0;
1809 unsigned int load_outside_cost = 0;
1810 unsigned int store_inside_cost = 0;
1811 unsigned int store_outside_cost = 0;
1812 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1814 stmt_vector_for_cost dummy;
1815 dummy.create (2);
1816 vect_get_peeling_costs_all_drs (datarefs, dr0,
1817 &load_inside_cost,
1818 &load_outside_cost,
1819 &dummy, estimated_npeels, true);
1820 dummy.release ();
1822 if (first_store)
1824 dummy.create (2);
1825 vect_get_peeling_costs_all_drs (datarefs, first_store,
1826 &store_inside_cost,
1827 &store_outside_cost,
1828 &dummy, estimated_npeels, true);
1829 dummy.release ();
1831 else
1833 store_inside_cost = INT_MAX;
1834 store_outside_cost = INT_MAX;
1837 if (load_inside_cost > store_inside_cost
1838 || (load_inside_cost == store_inside_cost
1839 && load_outside_cost > store_outside_cost))
1841 dr0 = first_store;
1842 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1843 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1845 else
1847 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1848 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1851 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1852 prologue_cost_vec.create (2);
1853 epilogue_cost_vec.create (2);
1855 int dummy2;
1856 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1857 (loop_vinfo, estimated_npeels, &dummy2,
1858 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1859 &prologue_cost_vec, &epilogue_cost_vec);
1861 prologue_cost_vec.release ();
1862 epilogue_cost_vec.release ();
1864 peel_for_unknown_alignment.peel_info.count = 1
1865 + STMT_VINFO_SAME_ALIGN_REFS
1866 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1869 peel_for_unknown_alignment.peel_info.npeel = 0;
1870 peel_for_unknown_alignment.peel_info.dr = dr0;
1872 best_peel = peel_for_unknown_alignment;
1874 peel_for_known_alignment.inside_cost = INT_MAX;
1875 peel_for_known_alignment.outside_cost = INT_MAX;
1876 peel_for_known_alignment.peel_info.count = 0;
1877 peel_for_known_alignment.peel_info.dr = NULL;
1879 if (do_peeling && one_misalignment_known)
1881 /* Peeling is possible, but there is no data access that is not supported
1882 unless aligned. So we try to choose the best possible peeling from
1883 the hash table. */
1884 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1885 (&peeling_htab, loop_vinfo);
1888 /* Compare costs of peeling for known and unknown alignment. */
1889 if (peel_for_known_alignment.peel_info.dr != NULL
1890 && peel_for_unknown_alignment.inside_cost
1891 >= peel_for_known_alignment.inside_cost)
1893 best_peel = peel_for_known_alignment;
1895 /* If the best peeling for known alignment has NPEEL == 0, perform no
1896 peeling at all except if there is an unsupportable dr that we can
1897 align. */
1898 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1899 do_peeling = false;
1902 /* If there is an unsupportable data ref, prefer this over all choices so far
1903 since we'd have to discard a chosen peeling except when it accidentally
1904 aligned the unsupportable data ref. */
1905 if (one_dr_unsupportable)
1906 dr0 = unsupportable_dr;
1907 else if (do_peeling)
1909 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1910 TODO: Use nopeel_outside_cost or get rid of it? */
1911 unsigned nopeel_inside_cost = 0;
1912 unsigned nopeel_outside_cost = 0;
1914 stmt_vector_for_cost dummy;
1915 dummy.create (2);
1916 vect_get_peeling_costs_all_drs (datarefs, NULL, &nopeel_inside_cost,
1917 &nopeel_outside_cost, &dummy, 0, false);
1918 dummy.release ();
1920 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1921 costs will be recorded. */
1922 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1923 prologue_cost_vec.create (2);
1924 epilogue_cost_vec.create (2);
1926 int dummy2;
1927 nopeel_outside_cost += vect_get_known_peeling_cost
1928 (loop_vinfo, 0, &dummy2,
1929 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1930 &prologue_cost_vec, &epilogue_cost_vec);
1932 prologue_cost_vec.release ();
1933 epilogue_cost_vec.release ();
1935 npeel = best_peel.peel_info.npeel;
1936 dr0 = best_peel.peel_info.dr;
1938 /* If no peeling is not more expensive than the best peeling we
1939 have so far, don't perform any peeling. */
1940 if (nopeel_inside_cost <= best_peel.inside_cost)
1941 do_peeling = false;
1944 if (do_peeling)
1946 stmt = DR_STMT (dr0);
1947 stmt_info = vinfo_for_stmt (stmt);
1948 vectype = STMT_VINFO_VECTYPE (stmt_info);
1950 if (known_alignment_for_access_p (dr0))
1952 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1953 size_zero_node) < 0;
1954 if (!npeel)
1956 /* Since it's known at compile time, compute the number of
1957 iterations in the peeled loop (the peeling factor) for use in
1958 updating DR_MISALIGNMENT values. The peeling factor is the
1959 vectorization factor minus the misalignment as an element
1960 count. */
1961 mis = negative ? DR_MISALIGNMENT (dr0) : -DR_MISALIGNMENT (dr0);
1962 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
1963 npeel = ((mis & (target_align - 1))
1964 / vect_get_scalar_dr_size (dr0));
1967 /* For interleaved data access every iteration accesses all the
1968 members of the group, therefore we divide the number of iterations
1969 by the group size. */
1970 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1971 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1972 npeel /= GROUP_SIZE (stmt_info);
1974 if (dump_enabled_p ())
1975 dump_printf_loc (MSG_NOTE, vect_location,
1976 "Try peeling by %d\n", npeel);
1979 /* Ensure that all datarefs can be vectorized after the peel. */
1980 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1981 do_peeling = false;
1983 /* Check if all datarefs are supportable and log. */
1984 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1986 stat = vect_verify_datarefs_alignment (loop_vinfo);
1987 if (!stat)
1988 do_peeling = false;
1989 else
1990 return stat;
1993 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1994 if (do_peeling)
1996 unsigned max_allowed_peel
1997 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1998 if (max_allowed_peel != (unsigned)-1)
2000 unsigned max_peel = npeel;
2001 if (max_peel == 0)
2003 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0);
2004 max_peel = target_align / vect_get_scalar_dr_size (dr0) - 1;
2006 if (max_peel > max_allowed_peel)
2008 do_peeling = false;
2009 if (dump_enabled_p ())
2010 dump_printf_loc (MSG_NOTE, vect_location,
2011 "Disable peeling, max peels reached: %d\n", max_peel);
2016 /* Cost model #2 - if peeling may result in a remaining loop not
2017 iterating enough to be vectorized then do not peel. Since this
2018 is a cost heuristic rather than a correctness decision, use the
2019 most likely runtime value for variable vectorization factors. */
2020 if (do_peeling
2021 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2023 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2024 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2025 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2026 < assumed_vf + max_peel)
2027 do_peeling = false;
2030 if (do_peeling)
2032 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2033 If the misalignment of DR_i is identical to that of dr0 then set
2034 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2035 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2036 by the peeling factor times the element size of DR_i (MOD the
2037 vectorization factor times the size). Otherwise, the
2038 misalignment of DR_i must be set to unknown. */
2039 FOR_EACH_VEC_ELT (datarefs, i, dr)
2040 if (dr != dr0)
2042 /* Strided accesses perform only component accesses, alignment
2043 is irrelevant for them. */
2044 stmt_info = vinfo_for_stmt (DR_STMT (dr));
2045 if (STMT_VINFO_STRIDED_P (stmt_info)
2046 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2047 continue;
2049 vect_update_misalignment_for_peel (dr, dr0, npeel);
2052 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
2053 if (npeel)
2054 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2055 else
2056 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2057 = DR_MISALIGNMENT (dr0);
2058 SET_DR_MISALIGNMENT (dr0, 0);
2059 if (dump_enabled_p ())
2061 dump_printf_loc (MSG_NOTE, vect_location,
2062 "Alignment of access forced using peeling.\n");
2063 dump_printf_loc (MSG_NOTE, vect_location,
2064 "Peeling for alignment will be applied.\n");
2067 /* The inside-loop cost will be accounted for in vectorizable_load
2068 and vectorizable_store correctly with adjusted alignments.
2069 Drop the body_cst_vec on the floor here. */
2070 stat = vect_verify_datarefs_alignment (loop_vinfo);
2071 gcc_assert (stat);
2072 return stat;
2076 /* (2) Versioning to force alignment. */
2078 /* Try versioning if:
2079 1) optimize loop for speed
2080 2) there is at least one unsupported misaligned data ref with an unknown
2081 misalignment, and
2082 3) all misaligned data refs with a known misalignment are supported, and
2083 4) the number of runtime alignment checks is within reason. */
2085 do_versioning =
2086 optimize_loop_nest_for_speed_p (loop)
2087 && (!loop->inner); /* FORNOW */
2089 if (do_versioning)
2091 FOR_EACH_VEC_ELT (datarefs, i, dr)
2093 stmt = DR_STMT (dr);
2094 stmt_info = vinfo_for_stmt (stmt);
2096 /* For interleaving, only the alignment of the first access
2097 matters. */
2098 if (aligned_access_p (dr)
2099 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2100 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2101 continue;
2103 if (STMT_VINFO_STRIDED_P (stmt_info))
2105 /* Strided loads perform only component accesses, alignment is
2106 irrelevant for them. */
2107 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2108 continue;
2109 do_versioning = false;
2110 break;
2113 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2115 if (!supportable_dr_alignment)
2117 gimple *stmt;
2118 int mask;
2119 tree vectype;
2121 if (known_alignment_for_access_p (dr)
2122 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2123 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2125 do_versioning = false;
2126 break;
2129 stmt = DR_STMT (dr);
2130 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2131 gcc_assert (vectype);
2133 /* At present we don't support versioning for alignment
2134 with variable VF, since there's no guarantee that the
2135 VF is a power of two. We could relax this if we added
2136 a way of enforcing a power-of-two size. */
2137 unsigned HOST_WIDE_INT size;
2138 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2140 do_versioning = false;
2141 break;
2144 /* The rightmost bits of an aligned address must be zeros.
2145 Construct the mask needed for this test. For example,
2146 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2147 mask must be 15 = 0xf. */
2148 mask = size - 1;
2150 /* FORNOW: use the same mask to test all potentially unaligned
2151 references in the loop. The vectorizer currently supports
2152 a single vector size, see the reference to
2153 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2154 vectorization factor is computed. */
2155 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2156 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2157 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2158 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2159 DR_STMT (dr));
2163 /* Versioning requires at least one misaligned data reference. */
2164 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2165 do_versioning = false;
2166 else if (!do_versioning)
2167 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2170 if (do_versioning)
2172 vec<gimple *> may_misalign_stmts
2173 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2174 gimple *stmt;
2176 /* It can now be assumed that the data references in the statements
2177 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2178 of the loop being vectorized. */
2179 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2181 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2182 dr = STMT_VINFO_DATA_REF (stmt_info);
2183 SET_DR_MISALIGNMENT (dr, 0);
2184 if (dump_enabled_p ())
2185 dump_printf_loc (MSG_NOTE, vect_location,
2186 "Alignment of access forced using versioning.\n");
2189 if (dump_enabled_p ())
2190 dump_printf_loc (MSG_NOTE, vect_location,
2191 "Versioning for alignment will be applied.\n");
2193 /* Peeling and versioning can't be done together at this time. */
2194 gcc_assert (! (do_peeling && do_versioning));
2196 stat = vect_verify_datarefs_alignment (loop_vinfo);
2197 gcc_assert (stat);
2198 return stat;
2201 /* This point is reached if neither peeling nor versioning is being done. */
2202 gcc_assert (! (do_peeling || do_versioning));
2204 stat = vect_verify_datarefs_alignment (loop_vinfo);
2205 return stat;
2209 /* Function vect_find_same_alignment_drs.
2211 Update group and alignment relations according to the chosen
2212 vectorization factor. */
2214 static void
2215 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2217 struct data_reference *dra = DDR_A (ddr);
2218 struct data_reference *drb = DDR_B (ddr);
2219 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2220 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2222 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2223 return;
2225 if (dra == drb)
2226 return;
2228 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2229 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2230 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2231 return;
2233 /* Two references with distance zero have the same alignment. */
2234 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2235 - wi::to_poly_offset (DR_INIT (drb)));
2236 if (maybe_ne (diff, 0))
2238 /* Get the wider of the two alignments. */
2239 unsigned int align_a = (vect_calculate_target_alignment (dra)
2240 / BITS_PER_UNIT);
2241 unsigned int align_b = (vect_calculate_target_alignment (drb)
2242 / BITS_PER_UNIT);
2243 unsigned int max_align = MAX (align_a, align_b);
2245 /* Require the gap to be a multiple of the larger vector alignment. */
2246 if (!multiple_p (diff, max_align))
2247 return;
2250 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2251 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2252 if (dump_enabled_p ())
2254 dump_printf_loc (MSG_NOTE, vect_location,
2255 "accesses have the same alignment: ");
2256 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2257 dump_printf (MSG_NOTE, " and ");
2258 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2259 dump_printf (MSG_NOTE, "\n");
2264 /* Function vect_analyze_data_refs_alignment
2266 Analyze the alignment of the data-references in the loop.
2267 Return FALSE if a data reference is found that cannot be vectorized. */
2269 bool
2270 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_NOTE, vect_location,
2274 "=== vect_analyze_data_refs_alignment ===\n");
2276 /* Mark groups of data references with same alignment using
2277 data dependence information. */
2278 vec<ddr_p> ddrs = vinfo->ddrs;
2279 struct data_dependence_relation *ddr;
2280 unsigned int i;
2282 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2283 vect_find_same_alignment_drs (ddr);
2285 vec<data_reference_p> datarefs = vinfo->datarefs;
2286 struct data_reference *dr;
2288 vect_record_base_alignments (vinfo);
2289 FOR_EACH_VEC_ELT (datarefs, i, dr)
2291 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2292 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2293 && !vect_compute_data_ref_alignment (dr))
2295 /* Strided accesses perform only component accesses, misalignment
2296 information is irrelevant for them. */
2297 if (STMT_VINFO_STRIDED_P (stmt_info)
2298 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2299 continue;
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303 "not vectorized: can't calculate alignment "
2304 "for data ref.\n");
2306 return false;
2310 return true;
2314 /* Analyze alignment of DRs of stmts in NODE. */
2316 static bool
2317 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2319 /* We vectorize from the first scalar stmt in the node unless
2320 the node is permuted in which case we start from the first
2321 element in the group. */
2322 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2323 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2324 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2325 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2327 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2328 if (! vect_compute_data_ref_alignment (dr)
2329 /* For creating the data-ref pointer we need alignment of the
2330 first element anyway. */
2331 || (dr != first_dr
2332 && ! vect_compute_data_ref_alignment (first_dr))
2333 || ! verify_data_ref_alignment (dr))
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337 "not vectorized: bad data alignment in basic "
2338 "block.\n");
2339 return false;
2342 return true;
2345 /* Function vect_slp_analyze_instance_alignment
2347 Analyze the alignment of the data-references in the SLP instance.
2348 Return FALSE if a data reference is found that cannot be vectorized. */
2350 bool
2351 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2353 if (dump_enabled_p ())
2354 dump_printf_loc (MSG_NOTE, vect_location,
2355 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2357 slp_tree node;
2358 unsigned i;
2359 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2360 if (! vect_slp_analyze_and_verify_node_alignment (node))
2361 return false;
2363 node = SLP_INSTANCE_TREE (instance);
2364 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2365 && ! vect_slp_analyze_and_verify_node_alignment
2366 (SLP_INSTANCE_TREE (instance)))
2367 return false;
2369 return true;
2373 /* Analyze groups of accesses: check that DR belongs to a group of
2374 accesses of legal size, step, etc. Detect gaps, single element
2375 interleaving, and other special cases. Set grouped access info.
2376 Collect groups of strided stores for further use in SLP analysis.
2377 Worker for vect_analyze_group_access. */
2379 static bool
2380 vect_analyze_group_access_1 (struct data_reference *dr)
2382 tree step = DR_STEP (dr);
2383 tree scalar_type = TREE_TYPE (DR_REF (dr));
2384 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2385 gimple *stmt = DR_STMT (dr);
2386 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2387 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2388 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2389 HOST_WIDE_INT dr_step = -1;
2390 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2391 bool slp_impossible = false;
2393 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2394 size of the interleaving group (including gaps). */
2395 if (tree_fits_shwi_p (step))
2397 dr_step = tree_to_shwi (step);
2398 /* Check that STEP is a multiple of type size. Otherwise there is
2399 a non-element-sized gap at the end of the group which we
2400 cannot represent in GROUP_GAP or GROUP_SIZE.
2401 ??? As we can handle non-constant step fine here we should
2402 simply remove uses of GROUP_GAP between the last and first
2403 element and instead rely on DR_STEP. GROUP_SIZE then would
2404 simply not include that gap. */
2405 if ((dr_step % type_size) != 0)
2407 if (dump_enabled_p ())
2409 dump_printf_loc (MSG_NOTE, vect_location,
2410 "Step ");
2411 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2412 dump_printf (MSG_NOTE,
2413 " is not a multiple of the element size for ");
2414 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2415 dump_printf (MSG_NOTE, "\n");
2417 return false;
2419 groupsize = absu_hwi (dr_step) / type_size;
2421 else
2422 groupsize = 0;
2424 /* Not consecutive access is possible only if it is a part of interleaving. */
2425 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2427 /* Check if it this DR is a part of interleaving, and is a single
2428 element of the group that is accessed in the loop. */
2430 /* Gaps are supported only for loads. STEP must be a multiple of the type
2431 size. */
2432 if (DR_IS_READ (dr)
2433 && (dr_step % type_size) == 0
2434 && groupsize > 0)
2436 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2437 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2438 GROUP_GAP (stmt_info) = groupsize - 1;
2439 if (dump_enabled_p ())
2441 dump_printf_loc (MSG_NOTE, vect_location,
2442 "Detected single element interleaving ");
2443 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2444 dump_printf (MSG_NOTE, " step ");
2445 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2446 dump_printf (MSG_NOTE, "\n");
2449 return true;
2452 if (dump_enabled_p ())
2454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2455 "not consecutive access ");
2456 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2459 if (bb_vinfo)
2461 /* Mark the statement as unvectorizable. */
2462 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2463 return true;
2466 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2467 STMT_VINFO_STRIDED_P (stmt_info) = true;
2468 return true;
2471 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2473 /* First stmt in the interleaving chain. Check the chain. */
2474 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2475 struct data_reference *data_ref = dr;
2476 unsigned int count = 1;
2477 tree prev_init = DR_INIT (data_ref);
2478 gimple *prev = stmt;
2479 HOST_WIDE_INT diff, gaps = 0;
2481 /* By construction, all group members have INTEGER_CST DR_INITs. */
2482 while (next)
2484 /* Skip same data-refs. In case that two or more stmts share
2485 data-ref (supported only for loads), we vectorize only the first
2486 stmt, and the rest get their vectorized loads from the first
2487 one. */
2488 if (!tree_int_cst_compare (DR_INIT (data_ref),
2489 DR_INIT (STMT_VINFO_DATA_REF (
2490 vinfo_for_stmt (next)))))
2492 if (DR_IS_WRITE (data_ref))
2494 if (dump_enabled_p ())
2495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2496 "Two store stmts share the same dr.\n");
2497 return false;
2500 if (dump_enabled_p ())
2501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2502 "Two or more load stmts share the same dr.\n");
2504 /* For load use the same data-ref load. */
2505 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2507 prev = next;
2508 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2509 continue;
2512 prev = next;
2513 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2515 /* All group members have the same STEP by construction. */
2516 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2518 /* Check that the distance between two accesses is equal to the type
2519 size. Otherwise, we have gaps. */
2520 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2521 - TREE_INT_CST_LOW (prev_init)) / type_size;
2522 if (diff != 1)
2524 /* FORNOW: SLP of accesses with gaps is not supported. */
2525 slp_impossible = true;
2526 if (DR_IS_WRITE (data_ref))
2528 if (dump_enabled_p ())
2529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2530 "interleaved store with gaps\n");
2531 return false;
2534 gaps += diff - 1;
2537 last_accessed_element += diff;
2539 /* Store the gap from the previous member of the group. If there is no
2540 gap in the access, GROUP_GAP is always 1. */
2541 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2543 prev_init = DR_INIT (data_ref);
2544 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2545 /* Count the number of data-refs in the chain. */
2546 count++;
2549 if (groupsize == 0)
2550 groupsize = count + gaps;
2552 /* This could be UINT_MAX but as we are generating code in a very
2553 inefficient way we have to cap earlier. See PR78699 for example. */
2554 if (groupsize > 4096)
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2558 "group is too large\n");
2559 return false;
2562 /* Check that the size of the interleaving is equal to count for stores,
2563 i.e., that there are no gaps. */
2564 if (groupsize != count
2565 && !DR_IS_READ (dr))
2567 if (dump_enabled_p ())
2568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2569 "interleaved store with gaps\n");
2570 return false;
2573 /* If there is a gap after the last load in the group it is the
2574 difference between the groupsize and the last accessed
2575 element.
2576 When there is no gap, this difference should be 0. */
2577 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2579 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2580 if (dump_enabled_p ())
2582 dump_printf_loc (MSG_NOTE, vect_location,
2583 "Detected interleaving ");
2584 if (DR_IS_READ (dr))
2585 dump_printf (MSG_NOTE, "load ");
2586 else
2587 dump_printf (MSG_NOTE, "store ");
2588 dump_printf (MSG_NOTE, "of size %u starting with ",
2589 (unsigned)groupsize);
2590 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2591 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2592 dump_printf_loc (MSG_NOTE, vect_location,
2593 "There is a gap of %u elements after the group\n",
2594 GROUP_GAP (vinfo_for_stmt (stmt)));
2597 /* SLP: create an SLP data structure for every interleaving group of
2598 stores for further analysis in vect_analyse_slp. */
2599 if (DR_IS_WRITE (dr) && !slp_impossible)
2601 if (loop_vinfo)
2602 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2603 if (bb_vinfo)
2604 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2608 return true;
2611 /* Analyze groups of accesses: check that DR belongs to a group of
2612 accesses of legal size, step, etc. Detect gaps, single element
2613 interleaving, and other special cases. Set grouped access info.
2614 Collect groups of strided stores for further use in SLP analysis. */
2616 static bool
2617 vect_analyze_group_access (struct data_reference *dr)
2619 if (!vect_analyze_group_access_1 (dr))
2621 /* Dissolve the group if present. */
2622 gimple *next;
2623 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2624 while (stmt)
2626 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2627 next = GROUP_NEXT_ELEMENT (vinfo);
2628 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2629 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2630 stmt = next;
2632 return false;
2634 return true;
2637 /* Analyze the access pattern of the data-reference DR.
2638 In case of non-consecutive accesses call vect_analyze_group_access() to
2639 analyze groups of accesses. */
2641 static bool
2642 vect_analyze_data_ref_access (struct data_reference *dr)
2644 tree step = DR_STEP (dr);
2645 tree scalar_type = TREE_TYPE (DR_REF (dr));
2646 gimple *stmt = DR_STMT (dr);
2647 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2648 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2649 struct loop *loop = NULL;
2651 if (loop_vinfo)
2652 loop = LOOP_VINFO_LOOP (loop_vinfo);
2654 if (loop_vinfo && !step)
2656 if (dump_enabled_p ())
2657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2658 "bad data-ref access in loop\n");
2659 return false;
2662 /* Allow loads with zero step in inner-loop vectorization. */
2663 if (loop_vinfo && integer_zerop (step))
2665 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2666 if (!nested_in_vect_loop_p (loop, stmt))
2667 return DR_IS_READ (dr);
2668 /* Allow references with zero step for outer loops marked
2669 with pragma omp simd only - it guarantees absence of
2670 loop-carried dependencies between inner loop iterations. */
2671 if (!loop->force_vectorize)
2673 if (dump_enabled_p ())
2674 dump_printf_loc (MSG_NOTE, vect_location,
2675 "zero step in inner loop of nest\n");
2676 return false;
2680 if (loop && nested_in_vect_loop_p (loop, stmt))
2682 /* Interleaved accesses are not yet supported within outer-loop
2683 vectorization for references in the inner-loop. */
2684 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2686 /* For the rest of the analysis we use the outer-loop step. */
2687 step = STMT_VINFO_DR_STEP (stmt_info);
2688 if (integer_zerop (step))
2690 if (dump_enabled_p ())
2691 dump_printf_loc (MSG_NOTE, vect_location,
2692 "zero step in outer loop.\n");
2693 return DR_IS_READ (dr);
2697 /* Consecutive? */
2698 if (TREE_CODE (step) == INTEGER_CST)
2700 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2701 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2702 || (dr_step < 0
2703 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2705 /* Mark that it is not interleaving. */
2706 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2707 return true;
2711 if (loop && nested_in_vect_loop_p (loop, stmt))
2713 if (dump_enabled_p ())
2714 dump_printf_loc (MSG_NOTE, vect_location,
2715 "grouped access in outer loop.\n");
2716 return false;
2720 /* Assume this is a DR handled by non-constant strided load case. */
2721 if (TREE_CODE (step) != INTEGER_CST)
2722 return (STMT_VINFO_STRIDED_P (stmt_info)
2723 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2724 || vect_analyze_group_access (dr)));
2726 /* Not consecutive access - check if it's a part of interleaving group. */
2727 return vect_analyze_group_access (dr);
2730 /* Compare two data-references DRA and DRB to group them into chunks
2731 suitable for grouping. */
2733 static int
2734 dr_group_sort_cmp (const void *dra_, const void *drb_)
2736 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2737 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2738 int cmp;
2740 /* Stabilize sort. */
2741 if (dra == drb)
2742 return 0;
2744 /* DRs in different loops never belong to the same group. */
2745 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2746 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2747 if (loopa != loopb)
2748 return loopa->num < loopb->num ? -1 : 1;
2750 /* Ordering of DRs according to base. */
2751 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2752 DR_BASE_ADDRESS (drb));
2753 if (cmp != 0)
2754 return cmp;
2756 /* And according to DR_OFFSET. */
2757 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2758 if (cmp != 0)
2759 return cmp;
2761 /* Put reads before writes. */
2762 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2763 return DR_IS_READ (dra) ? -1 : 1;
2765 /* Then sort after access size. */
2766 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2767 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2768 if (cmp != 0)
2769 return cmp;
2771 /* And after step. */
2772 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2773 if (cmp != 0)
2774 return cmp;
2776 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2777 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2778 if (cmp == 0)
2779 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2780 return cmp;
2783 /* If OP is the result of a conversion, return the unconverted value,
2784 otherwise return null. */
2786 static tree
2787 strip_conversion (tree op)
2789 if (TREE_CODE (op) != SSA_NAME)
2790 return NULL_TREE;
2791 gimple *stmt = SSA_NAME_DEF_STMT (op);
2792 if (!is_gimple_assign (stmt)
2793 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2794 return NULL_TREE;
2795 return gimple_assign_rhs1 (stmt);
2798 /* Return true if vectorizable_* routines can handle statements STMT1
2799 and STMT2 being in a single group. */
2801 static bool
2802 can_group_stmts_p (gimple *stmt1, gimple *stmt2)
2804 if (gimple_assign_single_p (stmt1))
2805 return gimple_assign_single_p (stmt2);
2807 if (is_gimple_call (stmt1) && gimple_call_internal_p (stmt1))
2809 /* Check for two masked loads or two masked stores. */
2810 if (!is_gimple_call (stmt2) || !gimple_call_internal_p (stmt2))
2811 return false;
2812 internal_fn ifn = gimple_call_internal_fn (stmt1);
2813 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2814 return false;
2815 if (ifn != gimple_call_internal_fn (stmt2))
2816 return false;
2818 /* Check that the masks are the same. Cope with casts of masks,
2819 like those created by build_mask_conversion. */
2820 tree mask1 = gimple_call_arg (stmt1, 2);
2821 tree mask2 = gimple_call_arg (stmt2, 2);
2822 if (!operand_equal_p (mask1, mask2, 0))
2824 mask1 = strip_conversion (mask1);
2825 if (!mask1)
2826 return false;
2827 mask2 = strip_conversion (mask2);
2828 if (!mask2)
2829 return false;
2830 if (!operand_equal_p (mask1, mask2, 0))
2831 return false;
2833 return true;
2836 return false;
2839 /* Function vect_analyze_data_ref_accesses.
2841 Analyze the access pattern of all the data references in the loop.
2843 FORNOW: the only access pattern that is considered vectorizable is a
2844 simple step 1 (consecutive) access.
2846 FORNOW: handle only arrays and pointer accesses. */
2848 bool
2849 vect_analyze_data_ref_accesses (vec_info *vinfo)
2851 unsigned int i;
2852 vec<data_reference_p> datarefs = vinfo->datarefs;
2853 struct data_reference *dr;
2855 if (dump_enabled_p ())
2856 dump_printf_loc (MSG_NOTE, vect_location,
2857 "=== vect_analyze_data_ref_accesses ===\n");
2859 if (datarefs.is_empty ())
2860 return true;
2862 /* Sort the array of datarefs to make building the interleaving chains
2863 linear. Don't modify the original vector's order, it is needed for
2864 determining what dependencies are reversed. */
2865 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2866 datarefs_copy.qsort (dr_group_sort_cmp);
2868 /* Build the interleaving chains. */
2869 for (i = 0; i < datarefs_copy.length () - 1;)
2871 data_reference_p dra = datarefs_copy[i];
2872 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2873 stmt_vec_info lastinfo = NULL;
2874 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2876 ++i;
2877 continue;
2879 for (i = i + 1; i < datarefs_copy.length (); ++i)
2881 data_reference_p drb = datarefs_copy[i];
2882 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2883 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2884 break;
2886 /* ??? Imperfect sorting (non-compatible types, non-modulo
2887 accesses, same accesses) can lead to a group to be artificially
2888 split here as we don't just skip over those. If it really
2889 matters we can push those to a worklist and re-iterate
2890 over them. The we can just skip ahead to the next DR here. */
2892 /* DRs in a different loop should not be put into the same
2893 interleaving group. */
2894 if (gimple_bb (DR_STMT (dra))->loop_father
2895 != gimple_bb (DR_STMT (drb))->loop_father)
2896 break;
2898 /* Check that the data-refs have same first location (except init)
2899 and they are both either store or load (not load and store,
2900 not masked loads or stores). */
2901 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2902 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2903 DR_BASE_ADDRESS (drb)) != 0
2904 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2905 || !can_group_stmts_p (DR_STMT (dra), DR_STMT (drb)))
2906 break;
2908 /* Check that the data-refs have the same constant size. */
2909 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2910 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2911 if (!tree_fits_uhwi_p (sza)
2912 || !tree_fits_uhwi_p (szb)
2913 || !tree_int_cst_equal (sza, szb))
2914 break;
2916 /* Check that the data-refs have the same step. */
2917 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2918 break;
2920 /* Check the types are compatible.
2921 ??? We don't distinguish this during sorting. */
2922 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2923 TREE_TYPE (DR_REF (drb))))
2924 break;
2926 /* Check that the DR_INITs are compile-time constants. */
2927 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
2928 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
2929 break;
2931 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2932 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2933 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2934 HOST_WIDE_INT init_prev
2935 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
2936 gcc_assert (init_a <= init_b
2937 && init_a <= init_prev
2938 && init_prev <= init_b);
2940 /* Do not place the same access in the interleaving chain twice. */
2941 if (init_b == init_prev)
2943 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
2944 < gimple_uid (DR_STMT (drb)));
2945 /* ??? For now we simply "drop" the later reference which is
2946 otherwise the same rather than finishing off this group.
2947 In the end we'd want to re-process duplicates forming
2948 multiple groups from the refs, likely by just collecting
2949 all candidates (including duplicates and split points
2950 below) in a vector and then process them together. */
2951 continue;
2954 /* If init_b == init_a + the size of the type * k, we have an
2955 interleaving, and DRA is accessed before DRB. */
2956 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2957 if (type_size_a == 0
2958 || (init_b - init_a) % type_size_a != 0)
2959 break;
2961 /* If we have a store, the accesses are adjacent. This splits
2962 groups into chunks we support (we don't support vectorization
2963 of stores with gaps). */
2964 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
2965 break;
2967 /* If the step (if not zero or non-constant) is greater than the
2968 difference between data-refs' inits this splits groups into
2969 suitable sizes. */
2970 if (tree_fits_shwi_p (DR_STEP (dra)))
2972 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2973 if (step != 0 && step <= (init_b - init_a))
2974 break;
2977 if (dump_enabled_p ())
2979 dump_printf_loc (MSG_NOTE, vect_location,
2980 "Detected interleaving ");
2981 if (DR_IS_READ (dra))
2982 dump_printf (MSG_NOTE, "load ");
2983 else
2984 dump_printf (MSG_NOTE, "store ");
2985 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2986 dump_printf (MSG_NOTE, " and ");
2987 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2988 dump_printf (MSG_NOTE, "\n");
2991 /* Link the found element into the group list. */
2992 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2994 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2995 lastinfo = stmtinfo_a;
2997 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2998 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2999 lastinfo = stmtinfo_b;
3003 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3004 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
3005 && !vect_analyze_data_ref_access (dr))
3007 if (dump_enabled_p ())
3008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3009 "not vectorized: complicated access pattern.\n");
3011 if (is_a <bb_vec_info> (vinfo))
3013 /* Mark the statement as not vectorizable. */
3014 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3015 continue;
3017 else
3019 datarefs_copy.release ();
3020 return false;
3024 datarefs_copy.release ();
3025 return true;
3028 /* Function vect_vfa_segment_size.
3030 Create an expression that computes the size of segment
3031 that will be accessed for a data reference. The functions takes into
3032 account that realignment loads may access one more vector.
3034 Input:
3035 DR: The data reference.
3036 LENGTH_FACTOR: segment length to consider.
3038 Return an expression whose value is the size of segment which will be
3039 accessed by DR. */
3041 static tree
3042 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
3044 tree segment_length;
3046 if (integer_zerop (DR_STEP (dr)))
3047 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3048 else
3049 segment_length = size_binop (MULT_EXPR,
3050 fold_convert (sizetype, DR_STEP (dr)),
3051 fold_convert (sizetype, length_factor));
3053 if (vect_supportable_dr_alignment (dr, false)
3054 == dr_explicit_realign_optimized)
3056 tree vector_size = TYPE_SIZE_UNIT
3057 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
3059 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
3061 return segment_length;
3064 /* Function vect_no_alias_p.
3066 Given data references A and B with equal base and offset, see whether
3067 the alias relation can be decided at compilation time. Return 1 if
3068 it can and the references alias, 0 if it can and the references do
3069 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A
3070 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
3071 respectively. */
3073 static int
3074 vect_compile_time_alias (struct data_reference *a, struct data_reference *b,
3075 tree segment_length_a, tree segment_length_b)
3077 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a));
3078 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b));
3079 poly_uint64 const_length_a;
3080 poly_uint64 const_length_b;
3082 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3083 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3084 [a, a+12) */
3085 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3087 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3088 offset_a = (offset_a + vect_get_scalar_dr_size (a)) - const_length_a;
3090 else
3091 const_length_a = tree_to_poly_uint64 (segment_length_a);
3092 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3094 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3095 offset_b = (offset_b + vect_get_scalar_dr_size (b)) - const_length_b;
3097 else
3098 const_length_b = tree_to_poly_uint64 (segment_length_b);
3100 if (ranges_known_overlap_p (offset_a, const_length_a,
3101 offset_b, const_length_b))
3102 return 1;
3104 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3105 offset_b, const_length_b))
3106 return 0;
3108 return -1;
3111 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3112 in DDR is >= VF. */
3114 static bool
3115 dependence_distance_ge_vf (data_dependence_relation *ddr,
3116 unsigned int loop_depth, poly_uint64 vf)
3118 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3119 || DDR_NUM_DIST_VECTS (ddr) == 0)
3120 return false;
3122 /* If the dependence is exact, we should have limited the VF instead. */
3123 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3125 unsigned int i;
3126 lambda_vector dist_v;
3127 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3129 HOST_WIDE_INT dist = dist_v[loop_depth];
3130 if (dist != 0
3131 && !(dist > 0 && DDR_REVERSED_P (ddr))
3132 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3133 return false;
3136 if (dump_enabled_p ())
3138 dump_printf_loc (MSG_NOTE, vect_location,
3139 "dependence distance between ");
3140 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3141 dump_printf (MSG_NOTE, " and ");
3142 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3143 dump_printf (MSG_NOTE, " is >= VF\n");
3146 return true;
3149 /* Function vect_prune_runtime_alias_test_list.
3151 Prune a list of ddrs to be tested at run-time by versioning for alias.
3152 Merge several alias checks into one if possible.
3153 Return FALSE if resulting list of ddrs is longer then allowed by
3154 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3156 bool
3157 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3159 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3160 hash_set <tree_pair_hash> compared_objects;
3162 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3163 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3164 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3165 vec<vec_object_pair> &check_unequal_addrs
3166 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3167 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3168 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3170 ddr_p ddr;
3171 unsigned int i;
3172 tree length_factor;
3174 if (dump_enabled_p ())
3175 dump_printf_loc (MSG_NOTE, vect_location,
3176 "=== vect_prune_runtime_alias_test_list ===\n");
3178 if (may_alias_ddrs.is_empty ())
3179 return true;
3181 comp_alias_ddrs.create (may_alias_ddrs.length ());
3183 unsigned int loop_depth
3184 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3185 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3187 /* First, we collect all data ref pairs for aliasing checks. */
3188 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3190 int comp_res;
3191 struct data_reference *dr_a, *dr_b;
3192 gimple *dr_group_first_a, *dr_group_first_b;
3193 tree segment_length_a, segment_length_b;
3194 gimple *stmt_a, *stmt_b;
3196 /* Ignore the alias if the VF we chose ended up being no greater
3197 than the dependence distance. */
3198 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3199 continue;
3201 if (DDR_OBJECT_A (ddr))
3203 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3204 if (!compared_objects.add (new_pair))
3206 if (dump_enabled_p ())
3208 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3209 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3210 dump_printf (MSG_NOTE, " and ");
3211 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3212 dump_printf (MSG_NOTE, " have different addresses\n");
3214 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3216 continue;
3219 dr_a = DDR_A (ddr);
3220 stmt_a = DR_STMT (DDR_A (ddr));
3221 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3222 if (dr_group_first_a)
3224 stmt_a = dr_group_first_a;
3225 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3228 dr_b = DDR_B (ddr);
3229 stmt_b = DR_STMT (DDR_B (ddr));
3230 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3231 if (dr_group_first_b)
3233 stmt_b = dr_group_first_b;
3234 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3237 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3238 length_factor = scalar_loop_iters;
3239 else
3240 length_factor = size_int (vect_factor);
3241 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3242 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3244 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3245 DR_BASE_ADDRESS (dr_b));
3246 if (comp_res == 0)
3247 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3248 DR_OFFSET (dr_b));
3250 /* See whether the alias is known at compilation time. */
3251 if (comp_res == 0
3252 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3253 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3254 && poly_int_tree_p (segment_length_a)
3255 && poly_int_tree_p (segment_length_b))
3257 int res = vect_compile_time_alias (dr_a, dr_b,
3258 segment_length_a,
3259 segment_length_b);
3260 if (res == 0)
3261 continue;
3263 if (res == 1)
3265 if (dump_enabled_p ())
3266 dump_printf_loc (MSG_NOTE, vect_location,
3267 "not vectorized: compilation time alias.\n");
3268 return false;
3272 dr_with_seg_len_pair_t dr_with_seg_len_pair
3273 (dr_with_seg_len (dr_a, segment_length_a),
3274 dr_with_seg_len (dr_b, segment_length_b));
3276 /* Canonicalize pairs by sorting the two DR members. */
3277 if (comp_res > 0)
3278 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3280 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3283 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3285 unsigned int count = (comp_alias_ddrs.length ()
3286 + check_unequal_addrs.length ());
3287 dump_printf_loc (MSG_NOTE, vect_location,
3288 "improved number of alias checks from %d to %d\n",
3289 may_alias_ddrs.length (), count);
3290 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3292 if (dump_enabled_p ())
3293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3294 "number of versioning for alias "
3295 "run-time tests exceeds %d "
3296 "(--param vect-max-version-for-alias-checks)\n",
3297 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3298 return false;
3301 return true;
3304 /* Check whether we can use an internal function for a gather load
3305 or scatter store. READ_P is true for loads and false for stores.
3306 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3307 the type of the memory elements being loaded or stored. OFFSET_BITS
3308 is the number of bits in each scalar offset and OFFSET_SIGN is the
3309 sign of the offset. SCALE is the amount by which the offset should
3310 be multiplied *after* it has been converted to address width.
3312 Return true if the function is supported, storing the function
3313 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
3315 static bool
3316 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3317 tree memory_type, unsigned int offset_bits,
3318 signop offset_sign, int scale,
3319 internal_fn *ifn_out, tree *element_type_out)
3321 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3322 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3323 if (offset_bits > element_bits)
3324 /* Internal functions require the offset to be the same width as
3325 the vector elements. We can extend narrower offsets, but it isn't
3326 safe to truncate wider offsets. */
3327 return false;
3329 if (element_bits != memory_bits)
3330 /* For now the vector elements must be the same width as the
3331 memory elements. */
3332 return false;
3334 /* Work out which function we need. */
3335 internal_fn ifn;
3336 if (read_p)
3337 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3338 else
3339 return false;
3341 /* Test whether the target supports this combination. */
3342 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3343 offset_sign, scale))
3344 return false;
3346 *ifn_out = ifn;
3347 *element_type_out = TREE_TYPE (vectype);
3348 return true;
3351 /* CALL is a call to an internal gather load or scatter store function.
3352 Describe the operation in INFO. */
3354 static void
3355 vect_describe_gather_scatter_call (gcall *call, gather_scatter_info *info)
3357 stmt_vec_info stmt_info = vinfo_for_stmt (call);
3358 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3359 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3361 info->ifn = gimple_call_internal_fn (call);
3362 info->decl = NULL_TREE;
3363 info->base = gimple_call_arg (call, 0);
3364 info->offset = gimple_call_arg (call, 1);
3365 info->offset_dt = vect_unknown_def_type;
3366 info->offset_vectype = NULL_TREE;
3367 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3368 info->element_type = TREE_TYPE (vectype);
3369 info->memory_type = TREE_TYPE (DR_REF (dr));
3372 /* Return true if a non-affine read or write in STMT is suitable for a
3373 gather load or scatter store. Describe the operation in *INFO if so. */
3375 bool
3376 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3377 gather_scatter_info *info)
3379 HOST_WIDE_INT scale = 1;
3380 poly_int64 pbitpos, pbitsize;
3381 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3382 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3383 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3384 tree offtype = NULL_TREE;
3385 tree decl = NULL_TREE, base, off;
3386 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3387 tree memory_type = TREE_TYPE (DR_REF (dr));
3388 machine_mode pmode;
3389 int punsignedp, reversep, pvolatilep = 0;
3390 internal_fn ifn;
3391 tree element_type;
3392 bool masked_p = false;
3394 /* See whether this is already a call to a gather/scatter internal function.
3395 If not, see whether it's a masked load or store. */
3396 gcall *call = dyn_cast <gcall *> (stmt);
3397 if (call && gimple_call_internal_p (call))
3399 ifn = gimple_call_internal_fn (stmt);
3400 if (internal_gather_scatter_fn_p (ifn))
3402 vect_describe_gather_scatter_call (call, info);
3403 return true;
3405 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3408 /* True if we should aim to use internal functions rather than
3409 built-in functions. */
3410 bool use_ifn_p = (DR_IS_READ (dr)
3411 && supports_vec_gather_load_p ());
3413 base = DR_REF (dr);
3414 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3415 see if we can use the def stmt of the address. */
3416 if (masked_p
3417 && TREE_CODE (base) == MEM_REF
3418 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3419 && integer_zerop (TREE_OPERAND (base, 1))
3420 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3422 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3423 if (is_gimple_assign (def_stmt)
3424 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3425 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3428 /* The gather and scatter builtins need address of the form
3429 loop_invariant + vector * {1, 2, 4, 8}
3431 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3432 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3433 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3434 multiplications and additions in it. To get a vector, we need
3435 a single SSA_NAME that will be defined in the loop and will
3436 contain everything that is not loop invariant and that can be
3437 vectorized. The following code attempts to find such a preexistng
3438 SSA_NAME OFF and put the loop invariants into a tree BASE
3439 that can be gimplified before the loop. */
3440 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3441 &punsignedp, &reversep, &pvolatilep);
3442 gcc_assert (base && !reversep);
3443 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3445 if (TREE_CODE (base) == MEM_REF)
3447 if (!integer_zerop (TREE_OPERAND (base, 1)))
3449 if (off == NULL_TREE)
3450 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3451 else
3452 off = size_binop (PLUS_EXPR, off,
3453 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3455 base = TREE_OPERAND (base, 0);
3457 else
3458 base = build_fold_addr_expr (base);
3460 if (off == NULL_TREE)
3461 off = size_zero_node;
3463 /* If base is not loop invariant, either off is 0, then we start with just
3464 the constant offset in the loop invariant BASE and continue with base
3465 as OFF, otherwise give up.
3466 We could handle that case by gimplifying the addition of base + off
3467 into some SSA_NAME and use that as off, but for now punt. */
3468 if (!expr_invariant_in_loop_p (loop, base))
3470 if (!integer_zerop (off))
3471 return false;
3472 off = base;
3473 base = size_int (pbytepos);
3475 /* Otherwise put base + constant offset into the loop invariant BASE
3476 and continue with OFF. */
3477 else
3479 base = fold_convert (sizetype, base);
3480 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3483 /* OFF at this point may be either a SSA_NAME or some tree expression
3484 from get_inner_reference. Try to peel off loop invariants from it
3485 into BASE as long as possible. */
3486 STRIP_NOPS (off);
3487 while (offtype == NULL_TREE)
3489 enum tree_code code;
3490 tree op0, op1, add = NULL_TREE;
3492 if (TREE_CODE (off) == SSA_NAME)
3494 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3496 if (expr_invariant_in_loop_p (loop, off))
3497 return false;
3499 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3500 break;
3502 op0 = gimple_assign_rhs1 (def_stmt);
3503 code = gimple_assign_rhs_code (def_stmt);
3504 op1 = gimple_assign_rhs2 (def_stmt);
3506 else
3508 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3509 return false;
3510 code = TREE_CODE (off);
3511 extract_ops_from_tree (off, &code, &op0, &op1);
3513 switch (code)
3515 case POINTER_PLUS_EXPR:
3516 case PLUS_EXPR:
3517 if (expr_invariant_in_loop_p (loop, op0))
3519 add = op0;
3520 off = op1;
3521 do_add:
3522 add = fold_convert (sizetype, add);
3523 if (scale != 1)
3524 add = size_binop (MULT_EXPR, add, size_int (scale));
3525 base = size_binop (PLUS_EXPR, base, add);
3526 continue;
3528 if (expr_invariant_in_loop_p (loop, op1))
3530 add = op1;
3531 off = op0;
3532 goto do_add;
3534 break;
3535 case MINUS_EXPR:
3536 if (expr_invariant_in_loop_p (loop, op1))
3538 add = fold_convert (sizetype, op1);
3539 add = size_binop (MINUS_EXPR, size_zero_node, add);
3540 off = op0;
3541 goto do_add;
3543 break;
3544 case MULT_EXPR:
3545 if (scale == 1 && tree_fits_shwi_p (op1))
3547 int new_scale = tree_to_shwi (op1);
3548 /* Only treat this as a scaling operation if the target
3549 supports it. */
3550 if (use_ifn_p
3551 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3552 vectype, memory_type, 1,
3553 TYPE_SIGN (TREE_TYPE (op0)),
3554 new_scale, &ifn,
3555 &element_type))
3556 break;
3557 scale = new_scale;
3558 off = op0;
3559 continue;
3561 break;
3562 case SSA_NAME:
3563 off = op0;
3564 continue;
3565 CASE_CONVERT:
3566 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3567 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3568 break;
3569 if (TYPE_PRECISION (TREE_TYPE (op0))
3570 == TYPE_PRECISION (TREE_TYPE (off)))
3572 off = op0;
3573 continue;
3576 /* The internal functions need the offset to be the same width
3577 as the elements of VECTYPE. Don't include operations that
3578 cast the offset from that width to a different width. */
3579 if (use_ifn_p
3580 && (int_size_in_bytes (TREE_TYPE (vectype))
3581 == int_size_in_bytes (TREE_TYPE (off))))
3582 break;
3584 if (TYPE_PRECISION (TREE_TYPE (op0))
3585 < TYPE_PRECISION (TREE_TYPE (off)))
3587 off = op0;
3588 offtype = TREE_TYPE (off);
3589 STRIP_NOPS (off);
3590 continue;
3592 break;
3593 default:
3594 break;
3596 break;
3599 /* If at the end OFF still isn't a SSA_NAME or isn't
3600 defined in the loop, punt. */
3601 if (TREE_CODE (off) != SSA_NAME
3602 || expr_invariant_in_loop_p (loop, off))
3603 return false;
3605 if (offtype == NULL_TREE)
3606 offtype = TREE_TYPE (off);
3608 if (use_ifn_p)
3610 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3611 memory_type, TYPE_PRECISION (offtype),
3612 TYPE_SIGN (offtype), scale, &ifn,
3613 &element_type))
3614 return false;
3616 else
3618 if (DR_IS_READ (dr))
3619 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3620 else
3621 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3623 if (!decl)
3624 return false;
3626 ifn = IFN_LAST;
3627 element_type = TREE_TYPE (vectype);
3630 info->ifn = ifn;
3631 info->decl = decl;
3632 info->base = base;
3633 info->offset = off;
3634 info->offset_dt = vect_unknown_def_type;
3635 info->offset_vectype = NULL_TREE;
3636 info->scale = scale;
3637 info->element_type = element_type;
3638 info->memory_type = memory_type;
3639 return true;
3642 /* Function vect_analyze_data_refs.
3644 Find all the data references in the loop or basic block.
3646 The general structure of the analysis of data refs in the vectorizer is as
3647 follows:
3648 1- vect_analyze_data_refs(loop/bb): call
3649 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3650 in the loop/bb and their dependences.
3651 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3652 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3653 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3657 bool
3658 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
3660 struct loop *loop = NULL;
3661 unsigned int i;
3662 struct data_reference *dr;
3663 tree scalar_type;
3665 if (dump_enabled_p ())
3666 dump_printf_loc (MSG_NOTE, vect_location,
3667 "=== vect_analyze_data_refs ===\n");
3669 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3670 loop = LOOP_VINFO_LOOP (loop_vinfo);
3672 /* Go through the data-refs, check that the analysis succeeded. Update
3673 pointer from stmt_vec_info struct to DR and vectype. */
3675 vec<data_reference_p> datarefs = vinfo->datarefs;
3676 FOR_EACH_VEC_ELT (datarefs, i, dr)
3678 gimple *stmt;
3679 stmt_vec_info stmt_info;
3680 tree base, offset, init;
3681 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3682 bool simd_lane_access = false;
3683 poly_uint64 vf;
3685 again:
3686 if (!dr || !DR_REF (dr))
3688 if (dump_enabled_p ())
3689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3690 "not vectorized: unhandled data-ref\n");
3691 return false;
3694 stmt = DR_STMT (dr);
3695 stmt_info = vinfo_for_stmt (stmt);
3697 /* Discard clobbers from the dataref vector. We will remove
3698 clobber stmts during vectorization. */
3699 if (gimple_clobber_p (stmt))
3701 free_data_ref (dr);
3702 if (i == datarefs.length () - 1)
3704 datarefs.pop ();
3705 break;
3707 datarefs.ordered_remove (i);
3708 dr = datarefs[i];
3709 goto again;
3712 /* Check that analysis of the data-ref succeeded. */
3713 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3714 || !DR_STEP (dr))
3716 bool maybe_gather
3717 = DR_IS_READ (dr)
3718 && !TREE_THIS_VOLATILE (DR_REF (dr))
3719 && (targetm.vectorize.builtin_gather != NULL
3720 || supports_vec_gather_load_p ());
3721 bool maybe_scatter
3722 = DR_IS_WRITE (dr)
3723 && !TREE_THIS_VOLATILE (DR_REF (dr))
3724 && targetm.vectorize.builtin_scatter != NULL;
3725 bool maybe_simd_lane_access
3726 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3728 /* If target supports vector gather loads or scatter stores, or if
3729 this might be a SIMD lane access, see if they can't be used. */
3730 if (is_a <loop_vec_info> (vinfo)
3731 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3732 && !nested_in_vect_loop_p (loop, stmt))
3734 struct data_reference *newdr
3735 = create_data_ref (NULL, loop_containing_stmt (stmt),
3736 DR_REF (dr), stmt, !maybe_scatter,
3737 DR_IS_CONDITIONAL_IN_STMT (dr));
3738 gcc_assert (newdr != NULL && DR_REF (newdr));
3739 if (DR_BASE_ADDRESS (newdr)
3740 && DR_OFFSET (newdr)
3741 && DR_INIT (newdr)
3742 && DR_STEP (newdr)
3743 && integer_zerop (DR_STEP (newdr)))
3745 if (maybe_simd_lane_access)
3747 tree off = DR_OFFSET (newdr);
3748 STRIP_NOPS (off);
3749 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3750 && TREE_CODE (off) == MULT_EXPR
3751 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3753 tree step = TREE_OPERAND (off, 1);
3754 off = TREE_OPERAND (off, 0);
3755 STRIP_NOPS (off);
3756 if (CONVERT_EXPR_P (off)
3757 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3758 0)))
3759 < TYPE_PRECISION (TREE_TYPE (off)))
3760 off = TREE_OPERAND (off, 0);
3761 if (TREE_CODE (off) == SSA_NAME)
3763 gimple *def = SSA_NAME_DEF_STMT (off);
3764 tree reft = TREE_TYPE (DR_REF (newdr));
3765 if (is_gimple_call (def)
3766 && gimple_call_internal_p (def)
3767 && (gimple_call_internal_fn (def)
3768 == IFN_GOMP_SIMD_LANE))
3770 tree arg = gimple_call_arg (def, 0);
3771 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3772 arg = SSA_NAME_VAR (arg);
3773 if (arg == loop->simduid
3774 /* For now. */
3775 && tree_int_cst_equal
3776 (TYPE_SIZE_UNIT (reft),
3777 step))
3779 DR_OFFSET (newdr) = ssize_int (0);
3780 DR_STEP (newdr) = step;
3781 DR_OFFSET_ALIGNMENT (newdr)
3782 = BIGGEST_ALIGNMENT;
3783 DR_STEP_ALIGNMENT (newdr)
3784 = highest_pow2_factor (step);
3785 dr = newdr;
3786 simd_lane_access = true;
3792 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3794 dr = newdr;
3795 if (maybe_gather)
3796 gatherscatter = GATHER;
3797 else
3798 gatherscatter = SCATTER;
3801 if (gatherscatter == SG_NONE && !simd_lane_access)
3802 free_data_ref (newdr);
3805 if (gatherscatter == SG_NONE && !simd_lane_access)
3807 if (dump_enabled_p ())
3809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3810 "not vectorized: data ref analysis "
3811 "failed ");
3812 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3815 if (is_a <bb_vec_info> (vinfo))
3816 break;
3818 return false;
3822 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3824 if (dump_enabled_p ())
3825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3826 "not vectorized: base addr of dr is a "
3827 "constant\n");
3829 if (is_a <bb_vec_info> (vinfo))
3830 break;
3832 if (gatherscatter != SG_NONE || simd_lane_access)
3833 free_data_ref (dr);
3834 return false;
3837 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3839 if (dump_enabled_p ())
3841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3842 "not vectorized: volatile type ");
3843 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3846 if (is_a <bb_vec_info> (vinfo))
3847 break;
3849 return false;
3852 if (stmt_can_throw_internal (stmt))
3854 if (dump_enabled_p ())
3856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3857 "not vectorized: statement can throw an "
3858 "exception ");
3859 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3862 if (is_a <bb_vec_info> (vinfo))
3863 break;
3865 if (gatherscatter != SG_NONE || simd_lane_access)
3866 free_data_ref (dr);
3867 return false;
3870 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3871 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3873 if (dump_enabled_p ())
3875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3876 "not vectorized: statement is bitfield "
3877 "access ");
3878 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3881 if (is_a <bb_vec_info> (vinfo))
3882 break;
3884 if (gatherscatter != SG_NONE || simd_lane_access)
3885 free_data_ref (dr);
3886 return false;
3889 base = unshare_expr (DR_BASE_ADDRESS (dr));
3890 offset = unshare_expr (DR_OFFSET (dr));
3891 init = unshare_expr (DR_INIT (dr));
3893 if (is_gimple_call (stmt)
3894 && (!gimple_call_internal_p (stmt)
3895 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3896 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3898 if (dump_enabled_p ())
3900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3901 "not vectorized: dr in a call ");
3902 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3905 if (is_a <bb_vec_info> (vinfo))
3906 break;
3908 if (gatherscatter != SG_NONE || simd_lane_access)
3909 free_data_ref (dr);
3910 return false;
3913 /* Update DR field in stmt_vec_info struct. */
3915 /* If the dataref is in an inner-loop of the loop that is considered for
3916 for vectorization, we also want to analyze the access relative to
3917 the outer-loop (DR contains information only relative to the
3918 inner-most enclosing loop). We do that by building a reference to the
3919 first location accessed by the inner-loop, and analyze it relative to
3920 the outer-loop. */
3921 if (loop && nested_in_vect_loop_p (loop, stmt))
3923 /* Build a reference to the first location accessed by the
3924 inner loop: *(BASE + INIT + OFFSET). By construction,
3925 this address must be invariant in the inner loop, so we
3926 can consider it as being used in the outer loop. */
3927 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3928 init, offset);
3929 tree init_addr = fold_build_pointer_plus (base, init_offset);
3930 tree init_ref = build_fold_indirect_ref (init_addr);
3932 if (dump_enabled_p ())
3934 dump_printf_loc (MSG_NOTE, vect_location,
3935 "analyze in outer loop: ");
3936 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3937 dump_printf (MSG_NOTE, "\n");
3940 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3941 init_ref, loop))
3942 /* dr_analyze_innermost already explained the failure. */
3943 return false;
3945 if (dump_enabled_p ())
3947 dump_printf_loc (MSG_NOTE, vect_location,
3948 "\touter base_address: ");
3949 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3950 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3951 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3952 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3953 STMT_VINFO_DR_OFFSET (stmt_info));
3954 dump_printf (MSG_NOTE,
3955 "\n\touter constant offset from base address: ");
3956 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3957 STMT_VINFO_DR_INIT (stmt_info));
3958 dump_printf (MSG_NOTE, "\n\touter step: ");
3959 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3960 STMT_VINFO_DR_STEP (stmt_info));
3961 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3962 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3963 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3964 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3965 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3966 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3967 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3968 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3972 if (STMT_VINFO_DATA_REF (stmt_info))
3974 if (dump_enabled_p ())
3976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3977 "not vectorized: more than one data ref "
3978 "in stmt: ");
3979 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3982 if (is_a <bb_vec_info> (vinfo))
3983 break;
3985 if (gatherscatter != SG_NONE || simd_lane_access)
3986 free_data_ref (dr);
3987 return false;
3990 STMT_VINFO_DATA_REF (stmt_info) = dr;
3991 if (simd_lane_access)
3993 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3994 free_data_ref (datarefs[i]);
3995 datarefs[i] = dr;
3998 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3999 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
4000 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
4002 if (dump_enabled_p ())
4004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4005 "not vectorized: base object not addressable "
4006 "for stmt: ");
4007 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4009 if (is_a <bb_vec_info> (vinfo))
4011 /* In BB vectorization the ref can still participate
4012 in dependence analysis, we just can't vectorize it. */
4013 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4014 continue;
4016 return false;
4019 /* Set vectype for STMT. */
4020 scalar_type = TREE_TYPE (DR_REF (dr));
4021 STMT_VINFO_VECTYPE (stmt_info)
4022 = get_vectype_for_scalar_type (scalar_type);
4023 if (!STMT_VINFO_VECTYPE (stmt_info))
4025 if (dump_enabled_p ())
4027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4028 "not vectorized: no vectype for stmt: ");
4029 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4030 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4031 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4032 scalar_type);
4033 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4036 if (is_a <bb_vec_info> (vinfo))
4038 /* No vector type is fine, the ref can still participate
4039 in dependence analysis, we just can't vectorize it. */
4040 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4041 continue;
4044 if (gatherscatter != SG_NONE || simd_lane_access)
4046 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4047 if (gatherscatter != SG_NONE)
4048 free_data_ref (dr);
4050 return false;
4052 else
4054 if (dump_enabled_p ())
4056 dump_printf_loc (MSG_NOTE, vect_location,
4057 "got vectype for stmt: ");
4058 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
4059 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4060 STMT_VINFO_VECTYPE (stmt_info));
4061 dump_printf (MSG_NOTE, "\n");
4065 /* Adjust the minimal vectorization factor according to the
4066 vector type. */
4067 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4068 *min_vf = upper_bound (*min_vf, vf);
4070 if (gatherscatter != SG_NONE)
4072 gather_scatter_info gs_info;
4073 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4074 &gs_info)
4075 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4077 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4078 free_data_ref (dr);
4079 if (dump_enabled_p ())
4081 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4082 (gatherscatter == GATHER) ?
4083 "not vectorized: not suitable for gather "
4084 "load " :
4085 "not vectorized: not suitable for scatter "
4086 "store ");
4087 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4089 return false;
4092 free_data_ref (datarefs[i]);
4093 datarefs[i] = dr;
4094 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4097 else if (is_a <loop_vec_info> (vinfo)
4098 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4100 if (nested_in_vect_loop_p (loop, stmt))
4102 if (dump_enabled_p ())
4104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4105 "not vectorized: not suitable for strided "
4106 "load ");
4107 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4109 return false;
4111 STMT_VINFO_STRIDED_P (stmt_info) = true;
4115 /* If we stopped analysis at the first dataref we could not analyze
4116 when trying to vectorize a basic-block mark the rest of the datarefs
4117 as not vectorizable and truncate the vector of datarefs. That
4118 avoids spending useless time in analyzing their dependence. */
4119 if (i != datarefs.length ())
4121 gcc_assert (is_a <bb_vec_info> (vinfo));
4122 for (unsigned j = i; j < datarefs.length (); ++j)
4124 data_reference_p dr = datarefs[j];
4125 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4126 free_data_ref (dr);
4128 datarefs.truncate (i);
4131 return true;
4135 /* Function vect_get_new_vect_var.
4137 Returns a name for a new variable. The current naming scheme appends the
4138 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4139 the name of vectorizer generated variables, and appends that to NAME if
4140 provided. */
4142 tree
4143 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4145 const char *prefix;
4146 tree new_vect_var;
4148 switch (var_kind)
4150 case vect_simple_var:
4151 prefix = "vect";
4152 break;
4153 case vect_scalar_var:
4154 prefix = "stmp";
4155 break;
4156 case vect_mask_var:
4157 prefix = "mask";
4158 break;
4159 case vect_pointer_var:
4160 prefix = "vectp";
4161 break;
4162 default:
4163 gcc_unreachable ();
4166 if (name)
4168 char* tmp = concat (prefix, "_", name, NULL);
4169 new_vect_var = create_tmp_reg (type, tmp);
4170 free (tmp);
4172 else
4173 new_vect_var = create_tmp_reg (type, prefix);
4175 return new_vect_var;
4178 /* Like vect_get_new_vect_var but return an SSA name. */
4180 tree
4181 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4183 const char *prefix;
4184 tree new_vect_var;
4186 switch (var_kind)
4188 case vect_simple_var:
4189 prefix = "vect";
4190 break;
4191 case vect_scalar_var:
4192 prefix = "stmp";
4193 break;
4194 case vect_pointer_var:
4195 prefix = "vectp";
4196 break;
4197 default:
4198 gcc_unreachable ();
4201 if (name)
4203 char* tmp = concat (prefix, "_", name, NULL);
4204 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4205 free (tmp);
4207 else
4208 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4210 return new_vect_var;
4213 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
4215 static void
4216 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr)
4218 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4219 int misalign = DR_MISALIGNMENT (dr);
4220 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4221 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4222 else
4223 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4224 DR_TARGET_ALIGNMENT (dr), misalign);
4227 /* Function vect_create_addr_base_for_vector_ref.
4229 Create an expression that computes the address of the first memory location
4230 that will be accessed for a data reference.
4232 Input:
4233 STMT: The statement containing the data reference.
4234 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4235 OFFSET: Optional. If supplied, it is be added to the initial address.
4236 LOOP: Specify relative to which loop-nest should the address be computed.
4237 For example, when the dataref is in an inner-loop nested in an
4238 outer-loop that is now being vectorized, LOOP can be either the
4239 outer-loop, or the inner-loop. The first memory location accessed
4240 by the following dataref ('in' points to short):
4242 for (i=0; i<N; i++)
4243 for (j=0; j<M; j++)
4244 s += in[i+j]
4246 is as follows:
4247 if LOOP=i_loop: &in (relative to i_loop)
4248 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4249 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4250 initial address. Unlike OFFSET, which is number of elements to
4251 be added, BYTE_OFFSET is measured in bytes.
4253 Output:
4254 1. Return an SSA_NAME whose value is the address of the memory location of
4255 the first vector of the data reference.
4256 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4257 these statement(s) which define the returned SSA_NAME.
4259 FORNOW: We are only handling array accesses with step 1. */
4261 tree
4262 vect_create_addr_base_for_vector_ref (gimple *stmt,
4263 gimple_seq *new_stmt_list,
4264 tree offset,
4265 tree byte_offset)
4267 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4268 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4269 const char *base_name;
4270 tree addr_base;
4271 tree dest;
4272 gimple_seq seq = NULL;
4273 tree vect_ptr_type;
4274 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4275 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4276 innermost_loop_behavior *drb = vect_dr_behavior (dr);
4278 tree data_ref_base = unshare_expr (drb->base_address);
4279 tree base_offset = unshare_expr (drb->offset);
4280 tree init = unshare_expr (drb->init);
4282 if (loop_vinfo)
4283 base_name = get_name (data_ref_base);
4284 else
4286 base_offset = ssize_int (0);
4287 init = ssize_int (0);
4288 base_name = get_name (DR_REF (dr));
4291 /* Create base_offset */
4292 base_offset = size_binop (PLUS_EXPR,
4293 fold_convert (sizetype, base_offset),
4294 fold_convert (sizetype, init));
4296 if (offset)
4298 offset = fold_build2 (MULT_EXPR, sizetype,
4299 fold_convert (sizetype, offset), step);
4300 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4301 base_offset, offset);
4303 if (byte_offset)
4305 byte_offset = fold_convert (sizetype, byte_offset);
4306 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4307 base_offset, byte_offset);
4310 /* base + base_offset */
4311 if (loop_vinfo)
4312 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4313 else
4315 addr_base = build1 (ADDR_EXPR,
4316 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4317 unshare_expr (DR_REF (dr)));
4320 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4321 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4322 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4323 gimple_seq_add_seq (new_stmt_list, seq);
4325 if (DR_PTR_INFO (dr)
4326 && TREE_CODE (addr_base) == SSA_NAME
4327 && !SSA_NAME_PTR_INFO (addr_base))
4329 vect_duplicate_ssa_name_ptr_info (addr_base, dr);
4330 if (offset || byte_offset)
4331 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4334 if (dump_enabled_p ())
4336 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4337 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4338 dump_printf (MSG_NOTE, "\n");
4341 return addr_base;
4345 /* Function vect_create_data_ref_ptr.
4347 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4348 location accessed in the loop by STMT, along with the def-use update
4349 chain to appropriately advance the pointer through the loop iterations.
4350 Also set aliasing information for the pointer. This pointer is used by
4351 the callers to this function to create a memory reference expression for
4352 vector load/store access.
4354 Input:
4355 1. STMT: a stmt that references memory. Expected to be of the form
4356 GIMPLE_ASSIGN <name, data-ref> or
4357 GIMPLE_ASSIGN <data-ref, name>.
4358 2. AGGR_TYPE: the type of the reference, which should be either a vector
4359 or an array.
4360 3. AT_LOOP: the loop where the vector memref is to be created.
4361 4. OFFSET (optional): an offset to be added to the initial address accessed
4362 by the data-ref in STMT.
4363 5. BSI: location where the new stmts are to be placed if there is no loop
4364 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4365 pointing to the initial address.
4366 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4367 to the initial address accessed by the data-ref in STMT. This is
4368 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4369 in bytes.
4371 Output:
4372 1. Declare a new ptr to vector_type, and have it point to the base of the
4373 data reference (initial addressed accessed by the data reference).
4374 For example, for vector of type V8HI, the following code is generated:
4376 v8hi *ap;
4377 ap = (v8hi *)initial_address;
4379 if OFFSET is not supplied:
4380 initial_address = &a[init];
4381 if OFFSET is supplied:
4382 initial_address = &a[init + OFFSET];
4383 if BYTE_OFFSET is supplied:
4384 initial_address = &a[init] + BYTE_OFFSET;
4386 Return the initial_address in INITIAL_ADDRESS.
4388 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4389 update the pointer in each iteration of the loop.
4391 Return the increment stmt that updates the pointer in PTR_INCR.
4393 3. Set INV_P to true if the access pattern of the data reference in the
4394 vectorized loop is invariant. Set it to false otherwise.
4396 4. Return the pointer. */
4398 tree
4399 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4400 tree offset, tree *initial_address,
4401 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4402 bool only_init, bool *inv_p, tree byte_offset)
4404 const char *base_name;
4405 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4406 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4407 struct loop *loop = NULL;
4408 bool nested_in_vect_loop = false;
4409 struct loop *containing_loop = NULL;
4410 tree aggr_ptr_type;
4411 tree aggr_ptr;
4412 tree new_temp;
4413 gimple_seq new_stmt_list = NULL;
4414 edge pe = NULL;
4415 basic_block new_bb;
4416 tree aggr_ptr_init;
4417 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4418 tree aptr;
4419 gimple_stmt_iterator incr_gsi;
4420 bool insert_after;
4421 tree indx_before_incr, indx_after_incr;
4422 gimple *incr;
4423 tree step;
4424 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4426 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4427 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4429 if (loop_vinfo)
4431 loop = LOOP_VINFO_LOOP (loop_vinfo);
4432 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4433 containing_loop = (gimple_bb (stmt))->loop_father;
4434 pe = loop_preheader_edge (loop);
4436 else
4438 gcc_assert (bb_vinfo);
4439 only_init = true;
4440 *ptr_incr = NULL;
4443 /* Check the step (evolution) of the load in LOOP, and record
4444 whether it's invariant. */
4445 step = vect_dr_behavior (dr)->step;
4446 if (integer_zerop (step))
4447 *inv_p = true;
4448 else
4449 *inv_p = false;
4451 /* Create an expression for the first address accessed by this load
4452 in LOOP. */
4453 base_name = get_name (DR_BASE_ADDRESS (dr));
4455 if (dump_enabled_p ())
4457 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4458 dump_printf_loc (MSG_NOTE, vect_location,
4459 "create %s-pointer variable to type: ",
4460 get_tree_code_name (TREE_CODE (aggr_type)));
4461 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4462 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4463 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4464 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4465 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4466 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4467 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4468 else
4469 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4470 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4471 dump_printf (MSG_NOTE, "\n");
4474 /* (1) Create the new aggregate-pointer variable.
4475 Vector and array types inherit the alias set of their component
4476 type by default so we need to use a ref-all pointer if the data
4477 reference does not conflict with the created aggregated data
4478 reference because it is not addressable. */
4479 bool need_ref_all = false;
4480 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4481 get_alias_set (DR_REF (dr))))
4482 need_ref_all = true;
4483 /* Likewise for any of the data references in the stmt group. */
4484 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4486 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4489 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4490 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4491 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4492 get_alias_set (DR_REF (sdr))))
4494 need_ref_all = true;
4495 break;
4497 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4499 while (orig_stmt);
4501 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4502 need_ref_all);
4503 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4506 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4507 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4508 def-use update cycles for the pointer: one relative to the outer-loop
4509 (LOOP), which is what steps (3) and (4) below do. The other is relative
4510 to the inner-loop (which is the inner-most loop containing the dataref),
4511 and this is done be step (5) below.
4513 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4514 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4515 redundant. Steps (3),(4) create the following:
4517 vp0 = &base_addr;
4518 LOOP: vp1 = phi(vp0,vp2)
4521 vp2 = vp1 + step
4522 goto LOOP
4524 If there is an inner-loop nested in loop, then step (5) will also be
4525 applied, and an additional update in the inner-loop will be created:
4527 vp0 = &base_addr;
4528 LOOP: vp1 = phi(vp0,vp2)
4530 inner: vp3 = phi(vp1,vp4)
4531 vp4 = vp3 + inner_step
4532 if () goto inner
4534 vp2 = vp1 + step
4535 if () goto LOOP */
4537 /* (2) Calculate the initial address of the aggregate-pointer, and set
4538 the aggregate-pointer to point to it before the loop. */
4540 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4542 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4543 offset, byte_offset);
4544 if (new_stmt_list)
4546 if (pe)
4548 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4549 gcc_assert (!new_bb);
4551 else
4552 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4555 *initial_address = new_temp;
4556 aggr_ptr_init = new_temp;
4558 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4559 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4560 inner-loop nested in LOOP (during outer-loop vectorization). */
4562 /* No update in loop is required. */
4563 if (only_init && (!loop_vinfo || at_loop == loop))
4564 aptr = aggr_ptr_init;
4565 else
4567 /* The step of the aggregate pointer is the type size. */
4568 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4569 /* One exception to the above is when the scalar step of the load in
4570 LOOP is zero. In this case the step here is also zero. */
4571 if (*inv_p)
4572 iv_step = size_zero_node;
4573 else if (tree_int_cst_sgn (step) == -1)
4574 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4576 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4578 create_iv (aggr_ptr_init,
4579 fold_convert (aggr_ptr_type, iv_step),
4580 aggr_ptr, loop, &incr_gsi, insert_after,
4581 &indx_before_incr, &indx_after_incr);
4582 incr = gsi_stmt (incr_gsi);
4583 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4585 /* Copy the points-to information if it exists. */
4586 if (DR_PTR_INFO (dr))
4588 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4589 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4591 if (ptr_incr)
4592 *ptr_incr = incr;
4594 aptr = indx_before_incr;
4597 if (!nested_in_vect_loop || only_init)
4598 return aptr;
4601 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4602 nested in LOOP, if exists. */
4604 gcc_assert (nested_in_vect_loop);
4605 if (!only_init)
4607 standard_iv_increment_position (containing_loop, &incr_gsi,
4608 &insert_after);
4609 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4610 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4611 &indx_after_incr);
4612 incr = gsi_stmt (incr_gsi);
4613 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4615 /* Copy the points-to information if it exists. */
4616 if (DR_PTR_INFO (dr))
4618 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr);
4619 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr);
4621 if (ptr_incr)
4622 *ptr_incr = incr;
4624 return indx_before_incr;
4626 else
4627 gcc_unreachable ();
4631 /* Function bump_vector_ptr
4633 Increment a pointer (to a vector type) by vector-size. If requested,
4634 i.e. if PTR-INCR is given, then also connect the new increment stmt
4635 to the existing def-use update-chain of the pointer, by modifying
4636 the PTR_INCR as illustrated below:
4638 The pointer def-use update-chain before this function:
4639 DATAREF_PTR = phi (p_0, p_2)
4640 ....
4641 PTR_INCR: p_2 = DATAREF_PTR + step
4643 The pointer def-use update-chain after this function:
4644 DATAREF_PTR = phi (p_0, p_2)
4645 ....
4646 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4647 ....
4648 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4650 Input:
4651 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4652 in the loop.
4653 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4654 the loop. The increment amount across iterations is expected
4655 to be vector_size.
4656 BSI - location where the new update stmt is to be placed.
4657 STMT - the original scalar memory-access stmt that is being vectorized.
4658 BUMP - optional. The offset by which to bump the pointer. If not given,
4659 the offset is assumed to be vector_size.
4661 Output: Return NEW_DATAREF_PTR as illustrated above.
4665 tree
4666 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4667 gimple *stmt, tree bump)
4669 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4670 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4671 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4672 tree update = TYPE_SIZE_UNIT (vectype);
4673 gassign *incr_stmt;
4674 ssa_op_iter iter;
4675 use_operand_p use_p;
4676 tree new_dataref_ptr;
4678 if (bump)
4679 update = bump;
4681 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4682 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4683 else
4684 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4685 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4686 dataref_ptr, update);
4687 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4689 /* Copy the points-to information if it exists. */
4690 if (DR_PTR_INFO (dr))
4692 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4693 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4696 if (!ptr_incr)
4697 return new_dataref_ptr;
4699 /* Update the vector-pointer's cross-iteration increment. */
4700 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4702 tree use = USE_FROM_PTR (use_p);
4704 if (use == dataref_ptr)
4705 SET_USE (use_p, new_dataref_ptr);
4706 else
4707 gcc_assert (tree_int_cst_compare (use, update) == 0);
4710 return new_dataref_ptr;
4714 /* Function vect_create_destination_var.
4716 Create a new temporary of type VECTYPE. */
4718 tree
4719 vect_create_destination_var (tree scalar_dest, tree vectype)
4721 tree vec_dest;
4722 const char *name;
4723 char *new_name;
4724 tree type;
4725 enum vect_var_kind kind;
4727 kind = vectype
4728 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4729 ? vect_mask_var
4730 : vect_simple_var
4731 : vect_scalar_var;
4732 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4734 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4736 name = get_name (scalar_dest);
4737 if (name)
4738 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4739 else
4740 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4741 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4742 free (new_name);
4744 return vec_dest;
4747 /* Function vect_grouped_store_supported.
4749 Returns TRUE if interleave high and interleave low permutations
4750 are supported, and FALSE otherwise. */
4752 bool
4753 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4755 machine_mode mode = TYPE_MODE (vectype);
4757 /* vect_permute_store_chain requires the group size to be equal to 3 or
4758 be a power of two. */
4759 if (count != 3 && exact_log2 (count) == -1)
4761 if (dump_enabled_p ())
4762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4763 "the size of the group of accesses"
4764 " is not a power of 2 or not eqaul to 3\n");
4765 return false;
4768 /* Check that the permutation is supported. */
4769 if (VECTOR_MODE_P (mode))
4771 unsigned int i;
4772 if (count == 3)
4774 unsigned int j0 = 0, j1 = 0, j2 = 0;
4775 unsigned int i, j;
4777 unsigned int nelt;
4778 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4780 if (dump_enabled_p ())
4781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4782 "cannot handle groups of 3 stores for"
4783 " variable-length vectors\n");
4784 return false;
4787 vec_perm_builder sel (nelt, nelt, 1);
4788 sel.quick_grow (nelt);
4789 vec_perm_indices indices;
4790 for (j = 0; j < 3; j++)
4792 int nelt0 = ((3 - j) * nelt) % 3;
4793 int nelt1 = ((3 - j) * nelt + 1) % 3;
4794 int nelt2 = ((3 - j) * nelt + 2) % 3;
4795 for (i = 0; i < nelt; i++)
4797 if (3 * i + nelt0 < nelt)
4798 sel[3 * i + nelt0] = j0++;
4799 if (3 * i + nelt1 < nelt)
4800 sel[3 * i + nelt1] = nelt + j1++;
4801 if (3 * i + nelt2 < nelt)
4802 sel[3 * i + nelt2] = 0;
4804 indices.new_vector (sel, 2, nelt);
4805 if (!can_vec_perm_const_p (mode, indices))
4807 if (dump_enabled_p ())
4808 dump_printf (MSG_MISSED_OPTIMIZATION,
4809 "permutation op not supported by target.\n");
4810 return false;
4813 for (i = 0; i < nelt; i++)
4815 if (3 * i + nelt0 < nelt)
4816 sel[3 * i + nelt0] = 3 * i + nelt0;
4817 if (3 * i + nelt1 < nelt)
4818 sel[3 * i + nelt1] = 3 * i + nelt1;
4819 if (3 * i + nelt2 < nelt)
4820 sel[3 * i + nelt2] = nelt + j2++;
4822 indices.new_vector (sel, 2, nelt);
4823 if (!can_vec_perm_const_p (mode, indices))
4825 if (dump_enabled_p ())
4826 dump_printf (MSG_MISSED_OPTIMIZATION,
4827 "permutation op not supported by target.\n");
4828 return false;
4831 return true;
4833 else
4835 /* If length is not equal to 3 then only power of 2 is supported. */
4836 gcc_assert (pow2p_hwi (count));
4837 poly_uint64 nelt = GET_MODE_NUNITS (mode);
4839 /* The encoding has 2 interleaved stepped patterns. */
4840 vec_perm_builder sel (nelt, 2, 3);
4841 sel.quick_grow (6);
4842 for (i = 0; i < 3; i++)
4844 sel[i * 2] = i;
4845 sel[i * 2 + 1] = i + nelt;
4847 vec_perm_indices indices (sel, 2, nelt);
4848 if (can_vec_perm_const_p (mode, indices))
4850 for (i = 0; i < 6; i++)
4851 sel[i] += exact_div (nelt, 2);
4852 indices.new_vector (sel, 2, nelt);
4853 if (can_vec_perm_const_p (mode, indices))
4854 return true;
4859 if (dump_enabled_p ())
4860 dump_printf (MSG_MISSED_OPTIMIZATION,
4861 "permutaion op not supported by target.\n");
4862 return false;
4866 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
4867 type VECTYPE. MASKED_P says whether the masked form is needed. */
4869 bool
4870 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
4871 bool masked_p)
4873 if (masked_p)
4874 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
4875 vec_mask_store_lanes_optab,
4876 vectype, count);
4877 else
4878 return vect_lanes_optab_supported_p ("vec_store_lanes",
4879 vec_store_lanes_optab,
4880 vectype, count);
4884 /* Function vect_permute_store_chain.
4886 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4887 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4888 the data correctly for the stores. Return the final references for stores
4889 in RESULT_CHAIN.
4891 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4892 The input is 4 vectors each containing 8 elements. We assign a number to
4893 each element, the input sequence is:
4895 1st vec: 0 1 2 3 4 5 6 7
4896 2nd vec: 8 9 10 11 12 13 14 15
4897 3rd vec: 16 17 18 19 20 21 22 23
4898 4th vec: 24 25 26 27 28 29 30 31
4900 The output sequence should be:
4902 1st vec: 0 8 16 24 1 9 17 25
4903 2nd vec: 2 10 18 26 3 11 19 27
4904 3rd vec: 4 12 20 28 5 13 21 30
4905 4th vec: 6 14 22 30 7 15 23 31
4907 i.e., we interleave the contents of the four vectors in their order.
4909 We use interleave_high/low instructions to create such output. The input of
4910 each interleave_high/low operation is two vectors:
4911 1st vec 2nd vec
4912 0 1 2 3 4 5 6 7
4913 the even elements of the result vector are obtained left-to-right from the
4914 high/low elements of the first vector. The odd elements of the result are
4915 obtained left-to-right from the high/low elements of the second vector.
4916 The output of interleave_high will be: 0 4 1 5
4917 and of interleave_low: 2 6 3 7
4920 The permutation is done in log LENGTH stages. In each stage interleave_high
4921 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4922 where the first argument is taken from the first half of DR_CHAIN and the
4923 second argument from it's second half.
4924 In our example,
4926 I1: interleave_high (1st vec, 3rd vec)
4927 I2: interleave_low (1st vec, 3rd vec)
4928 I3: interleave_high (2nd vec, 4th vec)
4929 I4: interleave_low (2nd vec, 4th vec)
4931 The output for the first stage is:
4933 I1: 0 16 1 17 2 18 3 19
4934 I2: 4 20 5 21 6 22 7 23
4935 I3: 8 24 9 25 10 26 11 27
4936 I4: 12 28 13 29 14 30 15 31
4938 The output of the second stage, i.e. the final result is:
4940 I1: 0 8 16 24 1 9 17 25
4941 I2: 2 10 18 26 3 11 19 27
4942 I3: 4 12 20 28 5 13 21 30
4943 I4: 6 14 22 30 7 15 23 31. */
4945 void
4946 vect_permute_store_chain (vec<tree> dr_chain,
4947 unsigned int length,
4948 gimple *stmt,
4949 gimple_stmt_iterator *gsi,
4950 vec<tree> *result_chain)
4952 tree vect1, vect2, high, low;
4953 gimple *perm_stmt;
4954 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4955 tree perm_mask_low, perm_mask_high;
4956 tree data_ref;
4957 tree perm3_mask_low, perm3_mask_high;
4958 unsigned int i, j, n, log_length = exact_log2 (length);
4960 result_chain->quick_grow (length);
4961 memcpy (result_chain->address (), dr_chain.address (),
4962 length * sizeof (tree));
4964 if (length == 3)
4966 /* vect_grouped_store_supported ensures that this is constant. */
4967 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4968 unsigned int j0 = 0, j1 = 0, j2 = 0;
4970 vec_perm_builder sel (nelt, nelt, 1);
4971 sel.quick_grow (nelt);
4972 vec_perm_indices indices;
4973 for (j = 0; j < 3; j++)
4975 int nelt0 = ((3 - j) * nelt) % 3;
4976 int nelt1 = ((3 - j) * nelt + 1) % 3;
4977 int nelt2 = ((3 - j) * nelt + 2) % 3;
4979 for (i = 0; i < nelt; i++)
4981 if (3 * i + nelt0 < nelt)
4982 sel[3 * i + nelt0] = j0++;
4983 if (3 * i + nelt1 < nelt)
4984 sel[3 * i + nelt1] = nelt + j1++;
4985 if (3 * i + nelt2 < nelt)
4986 sel[3 * i + nelt2] = 0;
4988 indices.new_vector (sel, 2, nelt);
4989 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
4991 for (i = 0; i < nelt; i++)
4993 if (3 * i + nelt0 < nelt)
4994 sel[3 * i + nelt0] = 3 * i + nelt0;
4995 if (3 * i + nelt1 < nelt)
4996 sel[3 * i + nelt1] = 3 * i + nelt1;
4997 if (3 * i + nelt2 < nelt)
4998 sel[3 * i + nelt2] = nelt + j2++;
5000 indices.new_vector (sel, 2, nelt);
5001 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5003 vect1 = dr_chain[0];
5004 vect2 = dr_chain[1];
5006 /* Create interleaving stmt:
5007 low = VEC_PERM_EXPR <vect1, vect2,
5008 {j, nelt, *, j + 1, nelt + j + 1, *,
5009 j + 2, nelt + j + 2, *, ...}> */
5010 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5011 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5012 vect2, perm3_mask_low);
5013 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5015 vect1 = data_ref;
5016 vect2 = dr_chain[2];
5017 /* Create interleaving stmt:
5018 low = VEC_PERM_EXPR <vect1, vect2,
5019 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5020 6, 7, nelt + j + 2, ...}> */
5021 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5022 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5023 vect2, perm3_mask_high);
5024 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5025 (*result_chain)[j] = data_ref;
5028 else
5030 /* If length is not equal to 3 then only power of 2 is supported. */
5031 gcc_assert (pow2p_hwi (length));
5033 /* The encoding has 2 interleaved stepped patterns. */
5034 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5035 vec_perm_builder sel (nelt, 2, 3);
5036 sel.quick_grow (6);
5037 for (i = 0; i < 3; i++)
5039 sel[i * 2] = i;
5040 sel[i * 2 + 1] = i + nelt;
5042 vec_perm_indices indices (sel, 2, nelt);
5043 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5045 for (i = 0; i < 6; i++)
5046 sel[i] += exact_div (nelt, 2);
5047 indices.new_vector (sel, 2, nelt);
5048 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5050 for (i = 0, n = log_length; i < n; i++)
5052 for (j = 0; j < length/2; j++)
5054 vect1 = dr_chain[j];
5055 vect2 = dr_chain[j+length/2];
5057 /* Create interleaving stmt:
5058 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5059 ...}> */
5060 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5061 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5062 vect2, perm_mask_high);
5063 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5064 (*result_chain)[2*j] = high;
5066 /* Create interleaving stmt:
5067 low = VEC_PERM_EXPR <vect1, vect2,
5068 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5069 ...}> */
5070 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5071 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5072 vect2, perm_mask_low);
5073 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5074 (*result_chain)[2*j+1] = low;
5076 memcpy (dr_chain.address (), result_chain->address (),
5077 length * sizeof (tree));
5082 /* Function vect_setup_realignment
5084 This function is called when vectorizing an unaligned load using
5085 the dr_explicit_realign[_optimized] scheme.
5086 This function generates the following code at the loop prolog:
5088 p = initial_addr;
5089 x msq_init = *(floor(p)); # prolog load
5090 realignment_token = call target_builtin;
5091 loop:
5092 x msq = phi (msq_init, ---)
5094 The stmts marked with x are generated only for the case of
5095 dr_explicit_realign_optimized.
5097 The code above sets up a new (vector) pointer, pointing to the first
5098 location accessed by STMT, and a "floor-aligned" load using that pointer.
5099 It also generates code to compute the "realignment-token" (if the relevant
5100 target hook was defined), and creates a phi-node at the loop-header bb
5101 whose arguments are the result of the prolog-load (created by this
5102 function) and the result of a load that takes place in the loop (to be
5103 created by the caller to this function).
5105 For the case of dr_explicit_realign_optimized:
5106 The caller to this function uses the phi-result (msq) to create the
5107 realignment code inside the loop, and sets up the missing phi argument,
5108 as follows:
5109 loop:
5110 msq = phi (msq_init, lsq)
5111 lsq = *(floor(p')); # load in loop
5112 result = realign_load (msq, lsq, realignment_token);
5114 For the case of dr_explicit_realign:
5115 loop:
5116 msq = *(floor(p)); # load in loop
5117 p' = p + (VS-1);
5118 lsq = *(floor(p')); # load in loop
5119 result = realign_load (msq, lsq, realignment_token);
5121 Input:
5122 STMT - (scalar) load stmt to be vectorized. This load accesses
5123 a memory location that may be unaligned.
5124 BSI - place where new code is to be inserted.
5125 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5126 is used.
5128 Output:
5129 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5130 target hook, if defined.
5131 Return value - the result of the loop-header phi node. */
5133 tree
5134 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5135 tree *realignment_token,
5136 enum dr_alignment_support alignment_support_scheme,
5137 tree init_addr,
5138 struct loop **at_loop)
5140 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5141 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5142 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5143 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5144 struct loop *loop = NULL;
5145 edge pe = NULL;
5146 tree scalar_dest = gimple_assign_lhs (stmt);
5147 tree vec_dest;
5148 gimple *inc;
5149 tree ptr;
5150 tree data_ref;
5151 basic_block new_bb;
5152 tree msq_init = NULL_TREE;
5153 tree new_temp;
5154 gphi *phi_stmt;
5155 tree msq = NULL_TREE;
5156 gimple_seq stmts = NULL;
5157 bool inv_p;
5158 bool compute_in_loop = false;
5159 bool nested_in_vect_loop = false;
5160 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5161 struct loop *loop_for_initial_load = NULL;
5163 if (loop_vinfo)
5165 loop = LOOP_VINFO_LOOP (loop_vinfo);
5166 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5169 gcc_assert (alignment_support_scheme == dr_explicit_realign
5170 || alignment_support_scheme == dr_explicit_realign_optimized);
5172 /* We need to generate three things:
5173 1. the misalignment computation
5174 2. the extra vector load (for the optimized realignment scheme).
5175 3. the phi node for the two vectors from which the realignment is
5176 done (for the optimized realignment scheme). */
5178 /* 1. Determine where to generate the misalignment computation.
5180 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5181 calculation will be generated by this function, outside the loop (in the
5182 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5183 caller, inside the loop.
5185 Background: If the misalignment remains fixed throughout the iterations of
5186 the loop, then both realignment schemes are applicable, and also the
5187 misalignment computation can be done outside LOOP. This is because we are
5188 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5189 are a multiple of VS (the Vector Size), and therefore the misalignment in
5190 different vectorized LOOP iterations is always the same.
5191 The problem arises only if the memory access is in an inner-loop nested
5192 inside LOOP, which is now being vectorized using outer-loop vectorization.
5193 This is the only case when the misalignment of the memory access may not
5194 remain fixed throughout the iterations of the inner-loop (as explained in
5195 detail in vect_supportable_dr_alignment). In this case, not only is the
5196 optimized realignment scheme not applicable, but also the misalignment
5197 computation (and generation of the realignment token that is passed to
5198 REALIGN_LOAD) have to be done inside the loop.
5200 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5201 or not, which in turn determines if the misalignment is computed inside
5202 the inner-loop, or outside LOOP. */
5204 if (init_addr != NULL_TREE || !loop_vinfo)
5206 compute_in_loop = true;
5207 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5211 /* 2. Determine where to generate the extra vector load.
5213 For the optimized realignment scheme, instead of generating two vector
5214 loads in each iteration, we generate a single extra vector load in the
5215 preheader of the loop, and in each iteration reuse the result of the
5216 vector load from the previous iteration. In case the memory access is in
5217 an inner-loop nested inside LOOP, which is now being vectorized using
5218 outer-loop vectorization, we need to determine whether this initial vector
5219 load should be generated at the preheader of the inner-loop, or can be
5220 generated at the preheader of LOOP. If the memory access has no evolution
5221 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5222 to be generated inside LOOP (in the preheader of the inner-loop). */
5224 if (nested_in_vect_loop)
5226 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5227 bool invariant_in_outerloop =
5228 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5229 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5231 else
5232 loop_for_initial_load = loop;
5233 if (at_loop)
5234 *at_loop = loop_for_initial_load;
5236 if (loop_for_initial_load)
5237 pe = loop_preheader_edge (loop_for_initial_load);
5239 /* 3. For the case of the optimized realignment, create the first vector
5240 load at the loop preheader. */
5242 if (alignment_support_scheme == dr_explicit_realign_optimized)
5244 /* Create msq_init = *(floor(p1)) in the loop preheader */
5245 gassign *new_stmt;
5247 gcc_assert (!compute_in_loop);
5248 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5249 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5250 NULL_TREE, &init_addr, NULL, &inc,
5251 true, &inv_p);
5252 if (TREE_CODE (ptr) == SSA_NAME)
5253 new_temp = copy_ssa_name (ptr);
5254 else
5255 new_temp = make_ssa_name (TREE_TYPE (ptr));
5256 unsigned int align = DR_TARGET_ALIGNMENT (dr);
5257 new_stmt = gimple_build_assign
5258 (new_temp, BIT_AND_EXPR, ptr,
5259 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5260 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5261 gcc_assert (!new_bb);
5262 data_ref
5263 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5264 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5265 new_stmt = gimple_build_assign (vec_dest, data_ref);
5266 new_temp = make_ssa_name (vec_dest, new_stmt);
5267 gimple_assign_set_lhs (new_stmt, new_temp);
5268 if (pe)
5270 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5271 gcc_assert (!new_bb);
5273 else
5274 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5276 msq_init = gimple_assign_lhs (new_stmt);
5279 /* 4. Create realignment token using a target builtin, if available.
5280 It is done either inside the containing loop, or before LOOP (as
5281 determined above). */
5283 if (targetm.vectorize.builtin_mask_for_load)
5285 gcall *new_stmt;
5286 tree builtin_decl;
5288 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
5289 if (!init_addr)
5291 /* Generate the INIT_ADDR computation outside LOOP. */
5292 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5293 NULL_TREE);
5294 if (loop)
5296 pe = loop_preheader_edge (loop);
5297 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5298 gcc_assert (!new_bb);
5300 else
5301 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5304 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5305 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5306 vec_dest =
5307 vect_create_destination_var (scalar_dest,
5308 gimple_call_return_type (new_stmt));
5309 new_temp = make_ssa_name (vec_dest, new_stmt);
5310 gimple_call_set_lhs (new_stmt, new_temp);
5312 if (compute_in_loop)
5313 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5314 else
5316 /* Generate the misalignment computation outside LOOP. */
5317 pe = loop_preheader_edge (loop);
5318 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5319 gcc_assert (!new_bb);
5322 *realignment_token = gimple_call_lhs (new_stmt);
5324 /* The result of the CALL_EXPR to this builtin is determined from
5325 the value of the parameter and no global variables are touched
5326 which makes the builtin a "const" function. Requiring the
5327 builtin to have the "const" attribute makes it unnecessary
5328 to call mark_call_clobbered. */
5329 gcc_assert (TREE_READONLY (builtin_decl));
5332 if (alignment_support_scheme == dr_explicit_realign)
5333 return msq;
5335 gcc_assert (!compute_in_loop);
5336 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5339 /* 5. Create msq = phi <msq_init, lsq> in loop */
5341 pe = loop_preheader_edge (containing_loop);
5342 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5343 msq = make_ssa_name (vec_dest);
5344 phi_stmt = create_phi_node (msq, containing_loop->header);
5345 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5347 return msq;
5351 /* Function vect_grouped_load_supported.
5353 COUNT is the size of the load group (the number of statements plus the
5354 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5355 only one statement, with a gap of COUNT - 1.
5357 Returns true if a suitable permute exists. */
5359 bool
5360 vect_grouped_load_supported (tree vectype, bool single_element_p,
5361 unsigned HOST_WIDE_INT count)
5363 machine_mode mode = TYPE_MODE (vectype);
5365 /* If this is single-element interleaving with an element distance
5366 that leaves unused vector loads around punt - we at least create
5367 very sub-optimal code in that case (and blow up memory,
5368 see PR65518). */
5369 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5371 if (dump_enabled_p ())
5372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5373 "single-element interleaving not supported "
5374 "for not adjacent vector loads\n");
5375 return false;
5378 /* vect_permute_load_chain requires the group size to be equal to 3 or
5379 be a power of two. */
5380 if (count != 3 && exact_log2 (count) == -1)
5382 if (dump_enabled_p ())
5383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5384 "the size of the group of accesses"
5385 " is not a power of 2 or not equal to 3\n");
5386 return false;
5389 /* Check that the permutation is supported. */
5390 if (VECTOR_MODE_P (mode))
5392 unsigned int i, j;
5393 if (count == 3)
5395 unsigned int nelt;
5396 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5398 if (dump_enabled_p ())
5399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5400 "cannot handle groups of 3 loads for"
5401 " variable-length vectors\n");
5402 return false;
5405 vec_perm_builder sel (nelt, nelt, 1);
5406 sel.quick_grow (nelt);
5407 vec_perm_indices indices;
5408 unsigned int k;
5409 for (k = 0; k < 3; k++)
5411 for (i = 0; i < nelt; i++)
5412 if (3 * i + k < 2 * nelt)
5413 sel[i] = 3 * i + k;
5414 else
5415 sel[i] = 0;
5416 indices.new_vector (sel, 2, nelt);
5417 if (!can_vec_perm_const_p (mode, indices))
5419 if (dump_enabled_p ())
5420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5421 "shuffle of 3 loads is not supported by"
5422 " target\n");
5423 return false;
5425 for (i = 0, j = 0; i < nelt; i++)
5426 if (3 * i + k < 2 * nelt)
5427 sel[i] = i;
5428 else
5429 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5430 indices.new_vector (sel, 2, nelt);
5431 if (!can_vec_perm_const_p (mode, indices))
5433 if (dump_enabled_p ())
5434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5435 "shuffle of 3 loads is not supported by"
5436 " target\n");
5437 return false;
5440 return true;
5442 else
5444 /* If length is not equal to 3 then only power of 2 is supported. */
5445 gcc_assert (pow2p_hwi (count));
5446 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5448 /* The encoding has a single stepped pattern. */
5449 vec_perm_builder sel (nelt, 1, 3);
5450 sel.quick_grow (3);
5451 for (i = 0; i < 3; i++)
5452 sel[i] = i * 2;
5453 vec_perm_indices indices (sel, 2, nelt);
5454 if (can_vec_perm_const_p (mode, indices))
5456 for (i = 0; i < 3; i++)
5457 sel[i] = i * 2 + 1;
5458 indices.new_vector (sel, 2, nelt);
5459 if (can_vec_perm_const_p (mode, indices))
5460 return true;
5465 if (dump_enabled_p ())
5466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5467 "extract even/odd not supported by target\n");
5468 return false;
5471 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5472 type VECTYPE. MASKED_P says whether the masked form is needed. */
5474 bool
5475 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5476 bool masked_p)
5478 if (masked_p)
5479 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5480 vec_mask_load_lanes_optab,
5481 vectype, count);
5482 else
5483 return vect_lanes_optab_supported_p ("vec_load_lanes",
5484 vec_load_lanes_optab,
5485 vectype, count);
5488 /* Function vect_permute_load_chain.
5490 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5491 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5492 the input data correctly. Return the final references for loads in
5493 RESULT_CHAIN.
5495 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5496 The input is 4 vectors each containing 8 elements. We assign a number to each
5497 element, the input sequence is:
5499 1st vec: 0 1 2 3 4 5 6 7
5500 2nd vec: 8 9 10 11 12 13 14 15
5501 3rd vec: 16 17 18 19 20 21 22 23
5502 4th vec: 24 25 26 27 28 29 30 31
5504 The output sequence should be:
5506 1st vec: 0 4 8 12 16 20 24 28
5507 2nd vec: 1 5 9 13 17 21 25 29
5508 3rd vec: 2 6 10 14 18 22 26 30
5509 4th vec: 3 7 11 15 19 23 27 31
5511 i.e., the first output vector should contain the first elements of each
5512 interleaving group, etc.
5514 We use extract_even/odd instructions to create such output. The input of
5515 each extract_even/odd operation is two vectors
5516 1st vec 2nd vec
5517 0 1 2 3 4 5 6 7
5519 and the output is the vector of extracted even/odd elements. The output of
5520 extract_even will be: 0 2 4 6
5521 and of extract_odd: 1 3 5 7
5524 The permutation is done in log LENGTH stages. In each stage extract_even
5525 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5526 their order. In our example,
5528 E1: extract_even (1st vec, 2nd vec)
5529 E2: extract_odd (1st vec, 2nd vec)
5530 E3: extract_even (3rd vec, 4th vec)
5531 E4: extract_odd (3rd vec, 4th vec)
5533 The output for the first stage will be:
5535 E1: 0 2 4 6 8 10 12 14
5536 E2: 1 3 5 7 9 11 13 15
5537 E3: 16 18 20 22 24 26 28 30
5538 E4: 17 19 21 23 25 27 29 31
5540 In order to proceed and create the correct sequence for the next stage (or
5541 for the correct output, if the second stage is the last one, as in our
5542 example), we first put the output of extract_even operation and then the
5543 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5544 The input for the second stage is:
5546 1st vec (E1): 0 2 4 6 8 10 12 14
5547 2nd vec (E3): 16 18 20 22 24 26 28 30
5548 3rd vec (E2): 1 3 5 7 9 11 13 15
5549 4th vec (E4): 17 19 21 23 25 27 29 31
5551 The output of the second stage:
5553 E1: 0 4 8 12 16 20 24 28
5554 E2: 2 6 10 14 18 22 26 30
5555 E3: 1 5 9 13 17 21 25 29
5556 E4: 3 7 11 15 19 23 27 31
5558 And RESULT_CHAIN after reordering:
5560 1st vec (E1): 0 4 8 12 16 20 24 28
5561 2nd vec (E3): 1 5 9 13 17 21 25 29
5562 3rd vec (E2): 2 6 10 14 18 22 26 30
5563 4th vec (E4): 3 7 11 15 19 23 27 31. */
5565 static void
5566 vect_permute_load_chain (vec<tree> dr_chain,
5567 unsigned int length,
5568 gimple *stmt,
5569 gimple_stmt_iterator *gsi,
5570 vec<tree> *result_chain)
5572 tree data_ref, first_vect, second_vect;
5573 tree perm_mask_even, perm_mask_odd;
5574 tree perm3_mask_low, perm3_mask_high;
5575 gimple *perm_stmt;
5576 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5577 unsigned int i, j, log_length = exact_log2 (length);
5579 result_chain->quick_grow (length);
5580 memcpy (result_chain->address (), dr_chain.address (),
5581 length * sizeof (tree));
5583 if (length == 3)
5585 /* vect_grouped_load_supported ensures that this is constant. */
5586 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5587 unsigned int k;
5589 vec_perm_builder sel (nelt, nelt, 1);
5590 sel.quick_grow (nelt);
5591 vec_perm_indices indices;
5592 for (k = 0; k < 3; k++)
5594 for (i = 0; i < nelt; i++)
5595 if (3 * i + k < 2 * nelt)
5596 sel[i] = 3 * i + k;
5597 else
5598 sel[i] = 0;
5599 indices.new_vector (sel, 2, nelt);
5600 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5602 for (i = 0, j = 0; i < nelt; i++)
5603 if (3 * i + k < 2 * nelt)
5604 sel[i] = i;
5605 else
5606 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5607 indices.new_vector (sel, 2, nelt);
5608 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5610 first_vect = dr_chain[0];
5611 second_vect = dr_chain[1];
5613 /* Create interleaving stmt (low part of):
5614 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5615 ...}> */
5616 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5617 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5618 second_vect, perm3_mask_low);
5619 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5621 /* Create interleaving stmt (high part of):
5622 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5623 ...}> */
5624 first_vect = data_ref;
5625 second_vect = dr_chain[2];
5626 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5627 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5628 second_vect, perm3_mask_high);
5629 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5630 (*result_chain)[k] = data_ref;
5633 else
5635 /* If length is not equal to 3 then only power of 2 is supported. */
5636 gcc_assert (pow2p_hwi (length));
5638 /* The encoding has a single stepped pattern. */
5639 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5640 vec_perm_builder sel (nelt, 1, 3);
5641 sel.quick_grow (3);
5642 for (i = 0; i < 3; ++i)
5643 sel[i] = i * 2;
5644 vec_perm_indices indices (sel, 2, nelt);
5645 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5647 for (i = 0; i < 3; ++i)
5648 sel[i] = i * 2 + 1;
5649 indices.new_vector (sel, 2, nelt);
5650 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5652 for (i = 0; i < log_length; i++)
5654 for (j = 0; j < length; j += 2)
5656 first_vect = dr_chain[j];
5657 second_vect = dr_chain[j+1];
5659 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5660 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5661 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5662 first_vect, second_vect,
5663 perm_mask_even);
5664 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5665 (*result_chain)[j/2] = data_ref;
5667 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5668 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5669 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5670 first_vect, second_vect,
5671 perm_mask_odd);
5672 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5673 (*result_chain)[j/2+length/2] = data_ref;
5675 memcpy (dr_chain.address (), result_chain->address (),
5676 length * sizeof (tree));
5681 /* Function vect_shift_permute_load_chain.
5683 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5684 sequence of stmts to reorder the input data accordingly.
5685 Return the final references for loads in RESULT_CHAIN.
5686 Return true if successed, false otherwise.
5688 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5689 The input is 3 vectors each containing 8 elements. We assign a
5690 number to each element, the input sequence is:
5692 1st vec: 0 1 2 3 4 5 6 7
5693 2nd vec: 8 9 10 11 12 13 14 15
5694 3rd vec: 16 17 18 19 20 21 22 23
5696 The output sequence should be:
5698 1st vec: 0 3 6 9 12 15 18 21
5699 2nd vec: 1 4 7 10 13 16 19 22
5700 3rd vec: 2 5 8 11 14 17 20 23
5702 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5704 First we shuffle all 3 vectors to get correct elements order:
5706 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5707 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5708 3rd vec: (16 19 22) (17 20 23) (18 21)
5710 Next we unite and shift vector 3 times:
5712 1st step:
5713 shift right by 6 the concatenation of:
5714 "1st vec" and "2nd vec"
5715 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5716 "2nd vec" and "3rd vec"
5717 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5718 "3rd vec" and "1st vec"
5719 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5720 | New vectors |
5722 So that now new vectors are:
5724 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5725 2nd vec: (10 13) (16 19 22) (17 20 23)
5726 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5728 2nd step:
5729 shift right by 5 the concatenation of:
5730 "1st vec" and "3rd vec"
5731 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5732 "2nd vec" and "1st vec"
5733 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5734 "3rd vec" and "2nd vec"
5735 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5736 | New vectors |
5738 So that now new vectors are:
5740 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5741 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5742 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5744 3rd step:
5745 shift right by 5 the concatenation of:
5746 "1st vec" and "1st vec"
5747 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5748 shift right by 3 the concatenation of:
5749 "2nd vec" and "2nd vec"
5750 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5751 | New vectors |
5753 So that now all vectors are READY:
5754 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5755 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5756 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5758 This algorithm is faster than one in vect_permute_load_chain if:
5759 1. "shift of a concatination" is faster than general permutation.
5760 This is usually so.
5761 2. The TARGET machine can't execute vector instructions in parallel.
5762 This is because each step of the algorithm depends on previous.
5763 The algorithm in vect_permute_load_chain is much more parallel.
5765 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5768 static bool
5769 vect_shift_permute_load_chain (vec<tree> dr_chain,
5770 unsigned int length,
5771 gimple *stmt,
5772 gimple_stmt_iterator *gsi,
5773 vec<tree> *result_chain)
5775 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5776 tree perm2_mask1, perm2_mask2, perm3_mask;
5777 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5778 gimple *perm_stmt;
5780 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5781 unsigned int i;
5782 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5783 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5785 unsigned HOST_WIDE_INT nelt, vf;
5786 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
5787 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
5788 /* Not supported for variable-length vectors. */
5789 return false;
5791 vec_perm_builder sel (nelt, nelt, 1);
5792 sel.quick_grow (nelt);
5794 result_chain->quick_grow (length);
5795 memcpy (result_chain->address (), dr_chain.address (),
5796 length * sizeof (tree));
5798 if (pow2p_hwi (length) && vf > 4)
5800 unsigned int j, log_length = exact_log2 (length);
5801 for (i = 0; i < nelt / 2; ++i)
5802 sel[i] = i * 2;
5803 for (i = 0; i < nelt / 2; ++i)
5804 sel[nelt / 2 + i] = i * 2 + 1;
5805 vec_perm_indices indices (sel, 2, nelt);
5806 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5808 if (dump_enabled_p ())
5809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5810 "shuffle of 2 fields structure is not \
5811 supported by target\n");
5812 return false;
5814 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
5816 for (i = 0; i < nelt / 2; ++i)
5817 sel[i] = i * 2 + 1;
5818 for (i = 0; i < nelt / 2; ++i)
5819 sel[nelt / 2 + i] = i * 2;
5820 indices.new_vector (sel, 2, nelt);
5821 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5823 if (dump_enabled_p ())
5824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5825 "shuffle of 2 fields structure is not \
5826 supported by target\n");
5827 return false;
5829 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
5831 /* Generating permutation constant to shift all elements.
5832 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5833 for (i = 0; i < nelt; i++)
5834 sel[i] = nelt / 2 + i;
5835 indices.new_vector (sel, 2, nelt);
5836 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5838 if (dump_enabled_p ())
5839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5840 "shift permutation is not supported by target\n");
5841 return false;
5843 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5845 /* Generating permutation constant to select vector from 2.
5846 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5847 for (i = 0; i < nelt / 2; i++)
5848 sel[i] = i;
5849 for (i = nelt / 2; i < nelt; i++)
5850 sel[i] = nelt + i;
5851 indices.new_vector (sel, 2, nelt);
5852 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5854 if (dump_enabled_p ())
5855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5856 "select is not supported by target\n");
5857 return false;
5859 select_mask = vect_gen_perm_mask_checked (vectype, indices);
5861 for (i = 0; i < log_length; i++)
5863 for (j = 0; j < length; j += 2)
5865 first_vect = dr_chain[j];
5866 second_vect = dr_chain[j + 1];
5868 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5869 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5870 first_vect, first_vect,
5871 perm2_mask1);
5872 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5873 vect[0] = data_ref;
5875 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5876 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5877 second_vect, second_vect,
5878 perm2_mask2);
5879 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5880 vect[1] = data_ref;
5882 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5883 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5884 vect[0], vect[1], shift1_mask);
5885 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5886 (*result_chain)[j/2 + length/2] = data_ref;
5888 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5889 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5890 vect[0], vect[1], select_mask);
5891 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5892 (*result_chain)[j/2] = data_ref;
5894 memcpy (dr_chain.address (), result_chain->address (),
5895 length * sizeof (tree));
5897 return true;
5899 if (length == 3 && vf > 2)
5901 unsigned int k = 0, l = 0;
5903 /* Generating permutation constant to get all elements in rigth order.
5904 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5905 for (i = 0; i < nelt; i++)
5907 if (3 * k + (l % 3) >= nelt)
5909 k = 0;
5910 l += (3 - (nelt % 3));
5912 sel[i] = 3 * k + (l % 3);
5913 k++;
5915 vec_perm_indices indices (sel, 2, nelt);
5916 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5918 if (dump_enabled_p ())
5919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5920 "shuffle of 3 fields structure is not \
5921 supported by target\n");
5922 return false;
5924 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
5926 /* Generating permutation constant to shift all elements.
5927 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5928 for (i = 0; i < nelt; i++)
5929 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5930 indices.new_vector (sel, 2, nelt);
5931 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5933 if (dump_enabled_p ())
5934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5935 "shift permutation is not supported by target\n");
5936 return false;
5938 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
5940 /* Generating permutation constant to shift all elements.
5941 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5942 for (i = 0; i < nelt; i++)
5943 sel[i] = 2 * (nelt / 3) + 1 + i;
5944 indices.new_vector (sel, 2, nelt);
5945 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5947 if (dump_enabled_p ())
5948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5949 "shift permutation is not supported by target\n");
5950 return false;
5952 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
5954 /* Generating permutation constant to shift all elements.
5955 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5956 for (i = 0; i < nelt; i++)
5957 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5958 indices.new_vector (sel, 2, nelt);
5959 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5961 if (dump_enabled_p ())
5962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5963 "shift permutation is not supported by target\n");
5964 return false;
5966 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
5968 /* Generating permutation constant to shift all elements.
5969 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5970 for (i = 0; i < nelt; i++)
5971 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5972 indices.new_vector (sel, 2, nelt);
5973 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
5975 if (dump_enabled_p ())
5976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5977 "shift permutation is not supported by target\n");
5978 return false;
5980 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
5982 for (k = 0; k < 3; k++)
5984 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5985 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5986 dr_chain[k], dr_chain[k],
5987 perm3_mask);
5988 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5989 vect[k] = data_ref;
5992 for (k = 0; k < 3; k++)
5994 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5995 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5996 vect[k % 3], vect[(k + 1) % 3],
5997 shift1_mask);
5998 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5999 vect_shift[k] = data_ref;
6002 for (k = 0; k < 3; k++)
6004 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6005 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6006 vect_shift[(4 - k) % 3],
6007 vect_shift[(3 - k) % 3],
6008 shift2_mask);
6009 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6010 vect[k] = data_ref;
6013 (*result_chain)[3 - (nelt % 3)] = vect[2];
6015 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6016 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6017 vect[0], shift3_mask);
6018 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6019 (*result_chain)[nelt % 3] = data_ref;
6021 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6022 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6023 vect[1], shift4_mask);
6024 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
6025 (*result_chain)[0] = data_ref;
6026 return true;
6028 return false;
6031 /* Function vect_transform_grouped_load.
6033 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6034 to perform their permutation and ascribe the result vectorized statements to
6035 the scalar statements.
6038 void
6039 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
6040 gimple_stmt_iterator *gsi)
6042 machine_mode mode;
6043 vec<tree> result_chain = vNULL;
6045 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6046 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6047 vectors, that are ready for vector computation. */
6048 result_chain.create (size);
6050 /* If reassociation width for vector type is 2 or greater target machine can
6051 execute 2 or more vector instructions in parallel. Otherwise try to
6052 get chain for loads group using vect_shift_permute_load_chain. */
6053 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
6054 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6055 || pow2p_hwi (size)
6056 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
6057 gsi, &result_chain))
6058 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
6059 vect_record_grouped_load_vectors (stmt, result_chain);
6060 result_chain.release ();
6063 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6064 generated as part of the vectorization of STMT. Assign the statement
6065 for each vector to the associated scalar statement. */
6067 void
6068 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
6070 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
6071 gimple *next_stmt, *new_stmt;
6072 unsigned int i, gap_count;
6073 tree tmp_data_ref;
6075 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6076 Since we scan the chain starting from it's first node, their order
6077 corresponds the order of data-refs in RESULT_CHAIN. */
6078 next_stmt = first_stmt;
6079 gap_count = 1;
6080 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6082 if (!next_stmt)
6083 break;
6085 /* Skip the gaps. Loads created for the gaps will be removed by dead
6086 code elimination pass later. No need to check for the first stmt in
6087 the group, since it always exists.
6088 GROUP_GAP is the number of steps in elements from the previous
6089 access (if there is no gap GROUP_GAP is 1). We skip loads that
6090 correspond to the gaps. */
6091 if (next_stmt != first_stmt
6092 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
6094 gap_count++;
6095 continue;
6098 while (next_stmt)
6100 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
6101 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6102 copies, and we put the new vector statement in the first available
6103 RELATED_STMT. */
6104 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
6105 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
6106 else
6108 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6110 gimple *prev_stmt =
6111 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
6112 gimple *rel_stmt =
6113 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
6114 while (rel_stmt)
6116 prev_stmt = rel_stmt;
6117 rel_stmt =
6118 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
6121 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
6122 new_stmt;
6126 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
6127 gap_count = 1;
6128 /* If NEXT_STMT accesses the same DR as the previous statement,
6129 put the same TMP_DATA_REF as its vectorized statement; otherwise
6130 get the next data-ref from RESULT_CHAIN. */
6131 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
6132 break;
6137 /* Function vect_force_dr_alignment_p.
6139 Returns whether the alignment of a DECL can be forced to be aligned
6140 on ALIGNMENT bit boundary. */
6142 bool
6143 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6145 if (!VAR_P (decl))
6146 return false;
6148 if (decl_in_symtab_p (decl)
6149 && !symtab_node::get (decl)->can_increase_alignment_p ())
6150 return false;
6152 if (TREE_STATIC (decl))
6153 return (alignment <= MAX_OFILE_ALIGNMENT);
6154 else
6155 return (alignment <= MAX_STACK_ALIGNMENT);
6159 /* Return whether the data reference DR is supported with respect to its
6160 alignment.
6161 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6162 it is aligned, i.e., check if it is possible to vectorize it with different
6163 alignment. */
6165 enum dr_alignment_support
6166 vect_supportable_dr_alignment (struct data_reference *dr,
6167 bool check_aligned_accesses)
6169 gimple *stmt = DR_STMT (dr);
6170 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6171 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6172 machine_mode mode = TYPE_MODE (vectype);
6173 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6174 struct loop *vect_loop = NULL;
6175 bool nested_in_vect_loop = false;
6177 if (aligned_access_p (dr) && !check_aligned_accesses)
6178 return dr_aligned;
6180 /* For now assume all conditional loads/stores support unaligned
6181 access without any special code. */
6182 if (is_gimple_call (stmt)
6183 && gimple_call_internal_p (stmt)
6184 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6185 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6186 return dr_unaligned_supported;
6188 if (loop_vinfo)
6190 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6191 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6194 /* Possibly unaligned access. */
6196 /* We can choose between using the implicit realignment scheme (generating
6197 a misaligned_move stmt) and the explicit realignment scheme (generating
6198 aligned loads with a REALIGN_LOAD). There are two variants to the
6199 explicit realignment scheme: optimized, and unoptimized.
6200 We can optimize the realignment only if the step between consecutive
6201 vector loads is equal to the vector size. Since the vector memory
6202 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6203 is guaranteed that the misalignment amount remains the same throughout the
6204 execution of the vectorized loop. Therefore, we can create the
6205 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6206 at the loop preheader.
6208 However, in the case of outer-loop vectorization, when vectorizing a
6209 memory access in the inner-loop nested within the LOOP that is now being
6210 vectorized, while it is guaranteed that the misalignment of the
6211 vectorized memory access will remain the same in different outer-loop
6212 iterations, it is *not* guaranteed that is will remain the same throughout
6213 the execution of the inner-loop. This is because the inner-loop advances
6214 with the original scalar step (and not in steps of VS). If the inner-loop
6215 step happens to be a multiple of VS, then the misalignment remains fixed
6216 and we can use the optimized realignment scheme. For example:
6218 for (i=0; i<N; i++)
6219 for (j=0; j<M; j++)
6220 s += a[i+j];
6222 When vectorizing the i-loop in the above example, the step between
6223 consecutive vector loads is 1, and so the misalignment does not remain
6224 fixed across the execution of the inner-loop, and the realignment cannot
6225 be optimized (as illustrated in the following pseudo vectorized loop):
6227 for (i=0; i<N; i+=4)
6228 for (j=0; j<M; j++){
6229 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6230 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6231 // (assuming that we start from an aligned address).
6234 We therefore have to use the unoptimized realignment scheme:
6236 for (i=0; i<N; i+=4)
6237 for (j=k; j<M; j+=4)
6238 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6239 // that the misalignment of the initial address is
6240 // 0).
6242 The loop can then be vectorized as follows:
6244 for (k=0; k<4; k++){
6245 rt = get_realignment_token (&vp[k]);
6246 for (i=0; i<N; i+=4){
6247 v1 = vp[i+k];
6248 for (j=k; j<M; j+=4){
6249 v2 = vp[i+j+VS-1];
6250 va = REALIGN_LOAD <v1,v2,rt>;
6251 vs += va;
6252 v1 = v2;
6255 } */
6257 if (DR_IS_READ (dr))
6259 bool is_packed = false;
6260 tree type = (TREE_TYPE (DR_REF (dr)));
6262 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6263 && (!targetm.vectorize.builtin_mask_for_load
6264 || targetm.vectorize.builtin_mask_for_load ()))
6266 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6268 /* If we are doing SLP then the accesses need not have the
6269 same alignment, instead it depends on the SLP group size. */
6270 if (loop_vinfo
6271 && STMT_SLP_TYPE (stmt_info)
6272 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6273 * GROUP_SIZE (vinfo_for_stmt
6274 (GROUP_FIRST_ELEMENT (stmt_info))),
6275 TYPE_VECTOR_SUBPARTS (vectype)))
6277 else if (!loop_vinfo
6278 || (nested_in_vect_loop
6279 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6280 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6281 return dr_explicit_realign;
6282 else
6283 return dr_explicit_realign_optimized;
6285 if (!known_alignment_for_access_p (dr))
6286 is_packed = not_size_aligned (DR_REF (dr));
6288 if (targetm.vectorize.support_vector_misalignment
6289 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6290 /* Can't software pipeline the loads, but can at least do them. */
6291 return dr_unaligned_supported;
6293 else
6295 bool is_packed = false;
6296 tree type = (TREE_TYPE (DR_REF (dr)));
6298 if (!known_alignment_for_access_p (dr))
6299 is_packed = not_size_aligned (DR_REF (dr));
6301 if (targetm.vectorize.support_vector_misalignment
6302 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6303 return dr_unaligned_supported;
6306 /* Unsupported. */
6307 return dr_unaligned_unsupported;