* tree-loop-distribution.c (struct partition): New field recording
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob907f35e6703b7e3afa5a3ce1feb897d1c3f85feb
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
55 /* Return true if load- or store-lanes optab OPTAB is implemented for
56 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
58 static bool
59 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
60 tree vectype, unsigned HOST_WIDE_INT count)
62 machine_mode mode, array_mode;
63 bool limit_p;
65 mode = TYPE_MODE (vectype);
66 limit_p = !targetm.array_mode_supported_p (mode, count);
67 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
68 MODE_INT, limit_p);
70 if (array_mode == BLKmode)
72 if (dump_enabled_p ())
73 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
74 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
75 GET_MODE_NAME (mode), count);
76 return false;
79 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
81 if (dump_enabled_p ())
82 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
83 "cannot use %s<%s><%s>\n", name,
84 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
85 return false;
88 if (dump_enabled_p ())
89 dump_printf_loc (MSG_NOTE, vect_location,
90 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
91 GET_MODE_NAME (mode));
93 return true;
97 /* Return the smallest scalar part of STMT.
98 This is used to determine the vectype of the stmt. We generally set the
99 vectype according to the type of the result (lhs). For stmts whose
100 result-type is different than the type of the arguments (e.g., demotion,
101 promotion), vectype will be reset appropriately (later). Note that we have
102 to visit the smallest datatype in this function, because that determines the
103 VF. If the smallest datatype in the loop is present only as the rhs of a
104 promotion operation - we'd miss it.
105 Such a case, where a variable of this datatype does not appear in the lhs
106 anywhere in the loop, can only occur if it's an invariant: e.g.:
107 'int_x = (int) short_inv', which we'd expect to have been optimized away by
108 invariant motion. However, we cannot rely on invariant motion to always
109 take invariants out of the loop, and so in the case of promotion we also
110 have to check the rhs.
111 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
112 types. */
114 tree
115 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
116 HOST_WIDE_INT *rhs_size_unit)
118 tree scalar_type = gimple_expr_type (stmt);
119 HOST_WIDE_INT lhs, rhs;
121 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
123 if (is_gimple_assign (stmt)
124 && (gimple_assign_cast_p (stmt)
125 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
126 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
127 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
129 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
131 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
132 if (rhs < lhs)
133 scalar_type = rhs_type;
136 *lhs_size_unit = lhs;
137 *rhs_size_unit = rhs;
138 return scalar_type;
142 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
143 tested at run-time. Return TRUE if DDR was successfully inserted.
144 Return false if versioning is not supported. */
146 static bool
147 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
149 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
151 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
152 return false;
154 if (!runtime_alias_check_p (ddr, loop,
155 optimize_loop_nest_for_speed_p (loop)))
156 return false;
158 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
159 return true;
163 /* Function vect_analyze_data_ref_dependence.
165 Return TRUE if there (might) exist a dependence between a memory-reference
166 DRA and a memory-reference DRB. When versioning for alias may check a
167 dependence at run-time, return FALSE. Adjust *MAX_VF according to
168 the data dependence. */
170 static bool
171 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
172 loop_vec_info loop_vinfo, int *max_vf)
174 unsigned int i;
175 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
176 struct data_reference *dra = DDR_A (ddr);
177 struct data_reference *drb = DDR_B (ddr);
178 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
179 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
180 lambda_vector dist_v;
181 unsigned int loop_depth;
183 /* In loop analysis all data references should be vectorizable. */
184 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
185 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
186 gcc_unreachable ();
188 /* Independent data accesses. */
189 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
190 return false;
192 if (dra == drb
193 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
194 return false;
196 /* We do not have to consider dependences between accesses that belong
197 to the same group. */
198 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
199 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
200 return false;
202 /* Even if we have an anti-dependence then, as the vectorized loop covers at
203 least two scalar iterations, there is always also a true dependence.
204 As the vectorizer does not re-order loads and stores we can ignore
205 the anti-dependence if TBAA can disambiguate both DRs similar to the
206 case with known negative distance anti-dependences (positive
207 distance anti-dependences would violate TBAA constraints). */
208 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
209 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
210 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
211 get_alias_set (DR_REF (drb))))
212 return false;
214 /* Unknown data dependence. */
215 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
217 /* If user asserted safelen consecutive iterations can be
218 executed concurrently, assume independence. */
219 if (loop->safelen >= 2)
221 if (loop->safelen < *max_vf)
222 *max_vf = loop->safelen;
223 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
224 return false;
227 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
228 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
230 if (dump_enabled_p ())
232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
233 "versioning for alias not supported for: "
234 "can't determine dependence between ");
235 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
236 DR_REF (dra));
237 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
238 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
239 DR_REF (drb));
240 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
242 return true;
245 if (dump_enabled_p ())
247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
248 "versioning for alias required: "
249 "can't determine dependence between ");
250 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
251 DR_REF (dra));
252 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
253 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
254 DR_REF (drb));
255 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
258 /* Add to list of ddrs that need to be tested at run-time. */
259 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
262 /* Known data dependence. */
263 if (DDR_NUM_DIST_VECTS (ddr) == 0)
265 /* If user asserted safelen consecutive iterations can be
266 executed concurrently, assume independence. */
267 if (loop->safelen >= 2)
269 if (loop->safelen < *max_vf)
270 *max_vf = loop->safelen;
271 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
272 return false;
275 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
276 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
278 if (dump_enabled_p ())
280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
281 "versioning for alias not supported for: "
282 "bad dist vector for ");
283 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
284 DR_REF (dra));
285 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
286 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
287 DR_REF (drb));
288 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
290 return true;
293 if (dump_enabled_p ())
295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
296 "versioning for alias required: "
297 "bad dist vector for ");
298 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
299 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
300 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
301 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
303 /* Add to list of ddrs that need to be tested at run-time. */
304 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
307 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
308 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
310 int dist = dist_v[loop_depth];
312 if (dump_enabled_p ())
313 dump_printf_loc (MSG_NOTE, vect_location,
314 "dependence distance = %d.\n", dist);
316 if (dist == 0)
318 if (dump_enabled_p ())
320 dump_printf_loc (MSG_NOTE, vect_location,
321 "dependence distance == 0 between ");
322 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
323 dump_printf (MSG_NOTE, " and ");
324 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
325 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
328 /* When we perform grouped accesses and perform implicit CSE
329 by detecting equal accesses and doing disambiguation with
330 runtime alias tests like for
331 .. = a[i];
332 .. = a[i+1];
333 a[i] = ..;
334 a[i+1] = ..;
335 *p = ..;
336 .. = a[i];
337 .. = a[i+1];
338 where we will end up loading { a[i], a[i+1] } once, make
339 sure that inserting group loads before the first load and
340 stores after the last store will do the right thing.
341 Similar for groups like
342 a[i] = ...;
343 ... = a[i];
344 a[i+1] = ...;
345 where loads from the group interleave with the store. */
346 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
347 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
349 gimple *earlier_stmt;
350 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
351 if (DR_IS_WRITE
352 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
354 if (dump_enabled_p ())
355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
356 "READ_WRITE dependence in interleaving."
357 "\n");
358 return true;
362 continue;
365 if (dist > 0 && DDR_REVERSED_P (ddr))
367 /* If DDR_REVERSED_P the order of the data-refs in DDR was
368 reversed (to make distance vector positive), and the actual
369 distance is negative. */
370 if (dump_enabled_p ())
371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
372 "dependence distance negative.\n");
373 /* Record a negative dependence distance to later limit the
374 amount of stmt copying / unrolling we can perform.
375 Only need to handle read-after-write dependence. */
376 if (DR_IS_READ (drb)
377 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
378 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
379 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
380 continue;
383 if (abs (dist) >= 2
384 && abs (dist) < *max_vf)
386 /* The dependence distance requires reduction of the maximal
387 vectorization factor. */
388 *max_vf = abs (dist);
389 if (dump_enabled_p ())
390 dump_printf_loc (MSG_NOTE, vect_location,
391 "adjusting maximal vectorization factor to %i\n",
392 *max_vf);
395 if (abs (dist) >= *max_vf)
397 /* Dependence distance does not create dependence, as far as
398 vectorization is concerned, in this case. */
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location,
401 "dependence distance >= VF.\n");
402 continue;
405 if (dump_enabled_p ())
407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
408 "not vectorized, possible dependence "
409 "between data-refs ");
410 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
411 dump_printf (MSG_NOTE, " and ");
412 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
413 dump_printf (MSG_NOTE, "\n");
416 return true;
419 return false;
422 /* Function vect_analyze_data_ref_dependences.
424 Examine all the data references in the loop, and make sure there do not
425 exist any data dependences between them. Set *MAX_VF according to
426 the maximum vectorization factor the data dependences allow. */
428 bool
429 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
431 unsigned int i;
432 struct data_dependence_relation *ddr;
434 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE, vect_location,
436 "=== vect_analyze_data_ref_dependences ===\n");
438 LOOP_VINFO_DDRS (loop_vinfo)
439 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
440 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
441 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
442 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
443 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
444 &LOOP_VINFO_DDRS (loop_vinfo),
445 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
446 return false;
448 /* For epilogues we either have no aliases or alias versioning
449 was applied to original loop. Therefore we may just get max_vf
450 using VF of original loop. */
451 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
452 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
453 else
454 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
455 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
456 return false;
458 return true;
462 /* Function vect_slp_analyze_data_ref_dependence.
464 Return TRUE if there (might) exist a dependence between a memory-reference
465 DRA and a memory-reference DRB. When versioning for alias may check a
466 dependence at run-time, return FALSE. Adjust *MAX_VF according to
467 the data dependence. */
469 static bool
470 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
472 struct data_reference *dra = DDR_A (ddr);
473 struct data_reference *drb = DDR_B (ddr);
475 /* We need to check dependences of statements marked as unvectorizable
476 as well, they still can prohibit vectorization. */
478 /* Independent data accesses. */
479 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
480 return false;
482 if (dra == drb)
483 return false;
485 /* Read-read is OK. */
486 if (DR_IS_READ (dra) && DR_IS_READ (drb))
487 return false;
489 /* If dra and drb are part of the same interleaving chain consider
490 them independent. */
491 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
492 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
493 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
494 return false;
496 /* Unknown data dependence. */
497 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
499 if (dump_enabled_p ())
501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
502 "can't determine dependence between ");
503 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
504 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
505 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
506 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
509 else if (dump_enabled_p ())
511 dump_printf_loc (MSG_NOTE, vect_location,
512 "determined dependence between ");
513 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
514 dump_printf (MSG_NOTE, " and ");
515 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
516 dump_printf (MSG_NOTE, "\n");
519 return true;
523 /* Analyze dependences involved in the transform of SLP NODE. STORES
524 contain the vector of scalar stores of this instance if we are
525 disambiguating the loads. */
527 static bool
528 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
529 vec<gimple *> stores, gimple *last_store)
531 /* This walks over all stmts involved in the SLP load/store done
532 in NODE verifying we can sink them up to the last stmt in the
533 group. */
534 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
535 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
537 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
538 if (access == last_access)
539 continue;
540 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
541 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
542 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
544 gimple *stmt = gsi_stmt (gsi);
545 if (! gimple_vuse (stmt)
546 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
547 continue;
549 /* If we couldn't record a (single) data reference for this
550 stmt we have to give up. */
551 /* ??? Here and below if dependence analysis fails we can resort
552 to the alias oracle which can handle more kinds of stmts. */
553 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
554 if (!dr_b)
555 return false;
557 bool dependent = false;
558 /* If we run into a store of this same instance (we've just
559 marked those) then delay dependence checking until we run
560 into the last store because this is where it will have
561 been sunk to (and we verify if we can do that as well). */
562 if (gimple_visited_p (stmt))
564 if (stmt != last_store)
565 continue;
566 unsigned i;
567 gimple *store;
568 FOR_EACH_VEC_ELT (stores, i, store)
570 data_reference *store_dr
571 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
572 ddr_p ddr = initialize_data_dependence_relation
573 (dr_a, store_dr, vNULL);
574 dependent = vect_slp_analyze_data_ref_dependence (ddr);
575 free_dependence_relation (ddr);
576 if (dependent)
577 break;
580 else
582 ddr_p ddr = initialize_data_dependence_relation (dr_a,
583 dr_b, vNULL);
584 dependent = vect_slp_analyze_data_ref_dependence (ddr);
585 free_dependence_relation (ddr);
587 if (dependent)
588 return false;
591 return true;
595 /* Function vect_analyze_data_ref_dependences.
597 Examine all the data references in the basic-block, and make sure there
598 do not exist any data dependences between them. Set *MAX_VF according to
599 the maximum vectorization factor the data dependences allow. */
601 bool
602 vect_slp_analyze_instance_dependence (slp_instance instance)
604 if (dump_enabled_p ())
605 dump_printf_loc (MSG_NOTE, vect_location,
606 "=== vect_slp_analyze_instance_dependence ===\n");
608 /* The stores of this instance are at the root of the SLP tree. */
609 slp_tree store = SLP_INSTANCE_TREE (instance);
610 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
611 store = NULL;
613 /* Verify we can sink stores to the vectorized stmt insert location. */
614 gimple *last_store = NULL;
615 if (store)
617 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
618 return false;
620 /* Mark stores in this instance and remember the last one. */
621 last_store = vect_find_last_scalar_stmt_in_slp (store);
622 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
623 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
626 bool res = true;
628 /* Verify we can sink loads to the vectorized stmt insert location,
629 special-casing stores of this instance. */
630 slp_tree load;
631 unsigned int i;
632 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
633 if (! vect_slp_analyze_node_dependences (instance, load,
634 store
635 ? SLP_TREE_SCALAR_STMTS (store)
636 : vNULL, last_store))
638 res = false;
639 break;
642 /* Unset the visited flag. */
643 if (store)
644 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
645 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
647 return res;
650 /* Function vect_compute_data_ref_alignment
652 Compute the misalignment of the data reference DR.
654 Output:
655 1. If during the misalignment computation it is found that the data reference
656 cannot be vectorized then false is returned.
657 2. DR_MISALIGNMENT (DR) is defined.
659 FOR NOW: No analysis is actually performed. Misalignment is calculated
660 only for trivial cases. TODO. */
662 bool
663 vect_compute_data_ref_alignment (struct data_reference *dr)
665 gimple *stmt = DR_STMT (dr);
666 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
667 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
668 struct loop *loop = NULL;
669 tree ref = DR_REF (dr);
670 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
672 if (dump_enabled_p ())
673 dump_printf_loc (MSG_NOTE, vect_location,
674 "vect_compute_data_ref_alignment:\n");
676 if (loop_vinfo)
677 loop = LOOP_VINFO_LOOP (loop_vinfo);
679 /* Initialize misalignment to unknown. */
680 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
682 innermost_loop_behavior *drb = vect_dr_behavior (dr);
683 bool step_preserves_misalignment_p;
685 /* No step for BB vectorization. */
686 if (!loop)
688 gcc_assert (integer_zerop (drb->step));
689 step_preserves_misalignment_p = true;
692 /* In case the dataref is in an inner-loop of the loop that is being
693 vectorized (LOOP), we use the base and misalignment information
694 relative to the outer-loop (LOOP). This is ok only if the misalignment
695 stays the same throughout the execution of the inner-loop, which is why
696 we have to check that the stride of the dataref in the inner-loop evenly
697 divides by the vector size. */
698 else if (nested_in_vect_loop_p (loop, stmt))
700 step_preserves_misalignment_p
701 = (DR_STEP_ALIGNMENT (dr)
702 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
704 if (dump_enabled_p ())
706 if (step_preserves_misalignment_p)
707 dump_printf_loc (MSG_NOTE, vect_location,
708 "inner step divides the vector-size.\n");
709 else
710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
711 "inner step doesn't divide the vector-size.\n");
715 /* Similarly we can only use base and misalignment information relative to
716 an innermost loop if the misalignment stays the same throughout the
717 execution of the loop. As above, this is the case if the stride of
718 the dataref evenly divides by the vector size. */
719 else
721 unsigned vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
722 step_preserves_misalignment_p
723 = ((DR_STEP_ALIGNMENT (dr) * vf)
724 % GET_MODE_SIZE (TYPE_MODE (vectype))) == 0;
726 if (!step_preserves_misalignment_p && dump_enabled_p ())
727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
728 "step doesn't divide the vector-size.\n");
731 unsigned int base_alignment = drb->base_alignment;
732 unsigned int base_misalignment = drb->base_misalignment;
733 unsigned HOST_WIDE_INT vector_alignment = TYPE_ALIGN_UNIT (vectype);
735 if (drb->offset_alignment < vector_alignment
736 || !step_preserves_misalignment_p
737 /* We need to know whether the step wrt the vectorized loop is
738 negative when computing the starting misalignment below. */
739 || TREE_CODE (drb->step) != INTEGER_CST)
741 if (dump_enabled_p ())
743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
744 "Unknown alignment for access: ");
745 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
746 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
748 return true;
751 if (base_alignment < vector_alignment)
753 tree base = drb->base_address;
754 if (TREE_CODE (base) == ADDR_EXPR)
755 base = TREE_OPERAND (base, 0);
756 if (!vect_can_force_dr_alignment_p (base,
757 vector_alignment * BITS_PER_UNIT))
759 if (dump_enabled_p ())
761 dump_printf_loc (MSG_NOTE, vect_location,
762 "can't force alignment of ref: ");
763 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
764 dump_printf (MSG_NOTE, "\n");
766 return true;
769 if (DECL_USER_ALIGN (base))
771 if (dump_enabled_p ())
773 dump_printf_loc (MSG_NOTE, vect_location,
774 "not forcing alignment of user-aligned "
775 "variable: ");
776 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
777 dump_printf (MSG_NOTE, "\n");
779 return true;
782 /* Force the alignment of the decl.
783 NOTE: This is the only change to the code we make during
784 the analysis phase, before deciding to vectorize the loop. */
785 if (dump_enabled_p ())
787 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
788 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
789 dump_printf (MSG_NOTE, "\n");
792 DR_VECT_AUX (dr)->base_decl = base;
793 DR_VECT_AUX (dr)->base_misaligned = true;
794 base_misalignment = 0;
796 unsigned int misalignment = (base_misalignment
797 + TREE_INT_CST_LOW (drb->init));
799 /* If this is a backward running DR then first access in the larger
800 vectype actually is N-1 elements before the address in the DR.
801 Adjust misalign accordingly. */
802 if (tree_int_cst_sgn (drb->step) < 0)
803 /* PLUS because STEP is negative. */
804 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
805 * TREE_INT_CST_LOW (drb->step));
807 SET_DR_MISALIGNMENT (dr, misalignment & (vector_alignment - 1));
809 if (dump_enabled_p ())
811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
812 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
813 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
814 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
817 return true;
821 /* Function vect_update_misalignment_for_peel.
822 Sets DR's misalignment
823 - to 0 if it has the same alignment as DR_PEEL,
824 - to the misalignment computed using NPEEL if DR's salignment is known,
825 - to -1 (unknown) otherwise.
827 DR - the data reference whose misalignment is to be adjusted.
828 DR_PEEL - the data reference whose misalignment is being made
829 zero in the vector loop by the peel.
830 NPEEL - the number of iterations in the peel loop if the misalignment
831 of DR_PEEL is known at compile time. */
833 static void
834 vect_update_misalignment_for_peel (struct data_reference *dr,
835 struct data_reference *dr_peel, int npeel)
837 unsigned int i;
838 vec<dr_p> same_aligned_drs;
839 struct data_reference *current_dr;
840 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
841 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
842 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
843 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
845 /* For interleaved data accesses the step in the loop must be multiplied by
846 the size of the interleaving group. */
847 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
848 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
849 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
850 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
852 /* It can be assumed that the data refs with the same alignment as dr_peel
853 are aligned in the vector loop. */
854 same_aligned_drs
855 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
856 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
858 if (current_dr != dr)
859 continue;
860 gcc_assert (!known_alignment_for_access_p (dr)
861 || !known_alignment_for_access_p (dr_peel)
862 || (DR_MISALIGNMENT (dr) / dr_size
863 == DR_MISALIGNMENT (dr_peel) / dr_peel_size));
864 SET_DR_MISALIGNMENT (dr, 0);
865 return;
868 if (known_alignment_for_access_p (dr)
869 && known_alignment_for_access_p (dr_peel))
871 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
872 int misal = DR_MISALIGNMENT (dr);
873 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
874 misal += negative ? -npeel * dr_size : npeel * dr_size;
875 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
876 SET_DR_MISALIGNMENT (dr, misal);
877 return;
880 if (dump_enabled_p ())
881 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
882 "to unknown (-1).\n");
883 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
887 /* Function verify_data_ref_alignment
889 Return TRUE if DR can be handled with respect to alignment. */
891 static bool
892 verify_data_ref_alignment (data_reference_p dr)
894 enum dr_alignment_support supportable_dr_alignment
895 = vect_supportable_dr_alignment (dr, false);
896 if (!supportable_dr_alignment)
898 if (dump_enabled_p ())
900 if (DR_IS_READ (dr))
901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
902 "not vectorized: unsupported unaligned load.");
903 else
904 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
905 "not vectorized: unsupported unaligned "
906 "store.");
908 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
909 DR_REF (dr));
910 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
912 return false;
915 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
916 dump_printf_loc (MSG_NOTE, vect_location,
917 "Vectorizing an unaligned access.\n");
919 return true;
922 /* Function vect_verify_datarefs_alignment
924 Return TRUE if all data references in the loop can be
925 handled with respect to alignment. */
927 bool
928 vect_verify_datarefs_alignment (loop_vec_info vinfo)
930 vec<data_reference_p> datarefs = vinfo->datarefs;
931 struct data_reference *dr;
932 unsigned int i;
934 FOR_EACH_VEC_ELT (datarefs, i, dr)
936 gimple *stmt = DR_STMT (dr);
937 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
939 if (!STMT_VINFO_RELEVANT_P (stmt_info))
940 continue;
942 /* For interleaving, only the alignment of the first access matters. */
943 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
944 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
945 continue;
947 /* Strided accesses perform only component accesses, alignment is
948 irrelevant for them. */
949 if (STMT_VINFO_STRIDED_P (stmt_info)
950 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
951 continue;
953 if (! verify_data_ref_alignment (dr))
954 return false;
957 return true;
960 /* Given an memory reference EXP return whether its alignment is less
961 than its size. */
963 static bool
964 not_size_aligned (tree exp)
966 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
967 return true;
969 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
970 > get_object_alignment (exp));
973 /* Function vector_alignment_reachable_p
975 Return true if vector alignment for DR is reachable by peeling
976 a few loop iterations. Return false otherwise. */
978 static bool
979 vector_alignment_reachable_p (struct data_reference *dr)
981 gimple *stmt = DR_STMT (dr);
982 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
983 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
985 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
987 /* For interleaved access we peel only if number of iterations in
988 the prolog loop ({VF - misalignment}), is a multiple of the
989 number of the interleaved accesses. */
990 int elem_size, mis_in_elements;
991 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
993 /* FORNOW: handle only known alignment. */
994 if (!known_alignment_for_access_p (dr))
995 return false;
997 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
998 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1000 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1001 return false;
1004 /* If misalignment is known at the compile time then allow peeling
1005 only if natural alignment is reachable through peeling. */
1006 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1008 HOST_WIDE_INT elmsize =
1009 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1010 if (dump_enabled_p ())
1012 dump_printf_loc (MSG_NOTE, vect_location,
1013 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1014 dump_printf (MSG_NOTE,
1015 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1017 if (DR_MISALIGNMENT (dr) % elmsize)
1019 if (dump_enabled_p ())
1020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1021 "data size does not divide the misalignment.\n");
1022 return false;
1026 if (!known_alignment_for_access_p (dr))
1028 tree type = TREE_TYPE (DR_REF (dr));
1029 bool is_packed = not_size_aligned (DR_REF (dr));
1030 if (dump_enabled_p ())
1031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1032 "Unknown misalignment, %snaturally aligned\n",
1033 is_packed ? "not " : "");
1034 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1037 return true;
1041 /* Calculate the cost of the memory access represented by DR. */
1043 static void
1044 vect_get_data_access_cost (struct data_reference *dr,
1045 unsigned int *inside_cost,
1046 unsigned int *outside_cost,
1047 stmt_vector_for_cost *body_cost_vec)
1049 gimple *stmt = DR_STMT (dr);
1050 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1051 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1052 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1053 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1054 int ncopies = MAX (1, vf / nunits); /* TODO: Handle SLP properly */
1056 if (DR_IS_READ (dr))
1057 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1058 NULL, body_cost_vec, false);
1059 else
1060 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1062 if (dump_enabled_p ())
1063 dump_printf_loc (MSG_NOTE, vect_location,
1064 "vect_get_data_access_cost: inside_cost = %d, "
1065 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1069 typedef struct _vect_peel_info
1071 struct data_reference *dr;
1072 int npeel;
1073 unsigned int count;
1074 } *vect_peel_info;
1076 typedef struct _vect_peel_extended_info
1078 struct _vect_peel_info peel_info;
1079 unsigned int inside_cost;
1080 unsigned int outside_cost;
1081 stmt_vector_for_cost body_cost_vec;
1082 } *vect_peel_extended_info;
1085 /* Peeling hashtable helpers. */
1087 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1089 static inline hashval_t hash (const _vect_peel_info *);
1090 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1093 inline hashval_t
1094 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1096 return (hashval_t) peel_info->npeel;
1099 inline bool
1100 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1102 return (a->npeel == b->npeel);
1106 /* Insert DR into peeling hash table with NPEEL as key. */
1108 static void
1109 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1110 loop_vec_info loop_vinfo, struct data_reference *dr,
1111 int npeel)
1113 struct _vect_peel_info elem, *slot;
1114 _vect_peel_info **new_slot;
1115 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1117 elem.npeel = npeel;
1118 slot = peeling_htab->find (&elem);
1119 if (slot)
1120 slot->count++;
1121 else
1123 slot = XNEW (struct _vect_peel_info);
1124 slot->npeel = npeel;
1125 slot->dr = dr;
1126 slot->count = 1;
1127 new_slot = peeling_htab->find_slot (slot, INSERT);
1128 *new_slot = slot;
1131 if (!supportable_dr_alignment
1132 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1133 slot->count += VECT_MAX_COST;
1137 /* Traverse peeling hash table to find peeling option that aligns maximum
1138 number of data accesses. */
1141 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1142 _vect_peel_extended_info *max)
1144 vect_peel_info elem = *slot;
1146 if (elem->count > max->peel_info.count
1147 || (elem->count == max->peel_info.count
1148 && max->peel_info.npeel > elem->npeel))
1150 max->peel_info.npeel = elem->npeel;
1151 max->peel_info.count = elem->count;
1152 max->peel_info.dr = elem->dr;
1155 return 1;
1158 /* Get the costs of peeling NPEEL iterations checking data access costs
1159 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1160 misalignment will be zero after peeling. */
1162 static void
1163 vect_get_peeling_costs_all_drs (struct data_reference *dr0,
1164 unsigned int *inside_cost,
1165 unsigned int *outside_cost,
1166 stmt_vector_for_cost *body_cost_vec,
1167 unsigned int npeel,
1168 bool unknown_misalignment)
1170 gimple *stmt = DR_STMT (dr0);
1171 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1172 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1173 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1175 unsigned i;
1176 data_reference *dr;
1178 FOR_EACH_VEC_ELT (datarefs, i, dr)
1180 stmt = DR_STMT (dr);
1181 stmt_info = vinfo_for_stmt (stmt);
1182 /* For interleaving, only the alignment of the first access
1183 matters. */
1184 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1185 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1186 continue;
1188 /* Strided accesses perform only component accesses, alignment is
1189 irrelevant for them. */
1190 if (STMT_VINFO_STRIDED_P (stmt_info)
1191 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1192 continue;
1194 int save_misalignment;
1195 save_misalignment = DR_MISALIGNMENT (dr);
1196 if (unknown_misalignment && dr == dr0)
1197 SET_DR_MISALIGNMENT (dr, 0);
1198 else
1199 vect_update_misalignment_for_peel (dr, dr0, npeel);
1200 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1201 body_cost_vec);
1202 SET_DR_MISALIGNMENT (dr, save_misalignment);
1206 /* Traverse peeling hash table and calculate cost for each peeling option.
1207 Find the one with the lowest cost. */
1210 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1211 _vect_peel_extended_info *min)
1213 vect_peel_info elem = *slot;
1214 int dummy;
1215 unsigned int inside_cost = 0, outside_cost = 0;
1216 gimple *stmt = DR_STMT (elem->dr);
1217 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1218 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1219 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1220 epilogue_cost_vec;
1222 prologue_cost_vec.create (2);
1223 body_cost_vec.create (2);
1224 epilogue_cost_vec.create (2);
1226 vect_get_peeling_costs_all_drs (elem->dr, &inside_cost, &outside_cost,
1227 &body_cost_vec, elem->npeel, false);
1229 outside_cost += vect_get_known_peeling_cost
1230 (loop_vinfo, elem->npeel, &dummy,
1231 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1232 &prologue_cost_vec, &epilogue_cost_vec);
1234 /* Prologue and epilogue costs are added to the target model later.
1235 These costs depend only on the scalar iteration cost, the
1236 number of peeling iterations finally chosen, and the number of
1237 misaligned statements. So discard the information found here. */
1238 prologue_cost_vec.release ();
1239 epilogue_cost_vec.release ();
1241 if (inside_cost < min->inside_cost
1242 || (inside_cost == min->inside_cost
1243 && outside_cost < min->outside_cost))
1245 min->inside_cost = inside_cost;
1246 min->outside_cost = outside_cost;
1247 min->body_cost_vec.release ();
1248 min->body_cost_vec = body_cost_vec;
1249 min->peel_info.dr = elem->dr;
1250 min->peel_info.npeel = elem->npeel;
1251 min->peel_info.count = elem->count;
1253 else
1254 body_cost_vec.release ();
1256 return 1;
1260 /* Choose best peeling option by traversing peeling hash table and either
1261 choosing an option with the lowest cost (if cost model is enabled) or the
1262 option that aligns as many accesses as possible. */
1264 static struct _vect_peel_extended_info
1265 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1266 loop_vec_info loop_vinfo,
1267 unsigned int *npeel,
1268 stmt_vector_for_cost *body_cost_vec)
1270 struct _vect_peel_extended_info res;
1272 res.peel_info.dr = NULL;
1273 res.body_cost_vec = stmt_vector_for_cost ();
1275 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1277 res.inside_cost = INT_MAX;
1278 res.outside_cost = INT_MAX;
1279 peeling_htab->traverse <_vect_peel_extended_info *,
1280 vect_peeling_hash_get_lowest_cost> (&res);
1282 else
1284 res.peel_info.count = 0;
1285 peeling_htab->traverse <_vect_peel_extended_info *,
1286 vect_peeling_hash_get_most_frequent> (&res);
1287 res.inside_cost = 0;
1288 res.outside_cost = 0;
1291 *npeel = res.peel_info.npeel;
1292 *body_cost_vec = res.body_cost_vec;
1293 return res;
1296 /* Return true if the new peeling NPEEL is supported. */
1298 static bool
1299 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1300 unsigned npeel)
1302 unsigned i;
1303 struct data_reference *dr = NULL;
1304 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1305 gimple *stmt;
1306 stmt_vec_info stmt_info;
1307 enum dr_alignment_support supportable_dr_alignment;
1309 /* Ensure that all data refs can be vectorized after the peel. */
1310 FOR_EACH_VEC_ELT (datarefs, i, dr)
1312 int save_misalignment;
1314 if (dr == dr0)
1315 continue;
1317 stmt = DR_STMT (dr);
1318 stmt_info = vinfo_for_stmt (stmt);
1319 /* For interleaving, only the alignment of the first access
1320 matters. */
1321 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1322 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1323 continue;
1325 /* Strided accesses perform only component accesses, alignment is
1326 irrelevant for them. */
1327 if (STMT_VINFO_STRIDED_P (stmt_info)
1328 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1329 continue;
1331 save_misalignment = DR_MISALIGNMENT (dr);
1332 vect_update_misalignment_for_peel (dr, dr0, npeel);
1333 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1334 SET_DR_MISALIGNMENT (dr, save_misalignment);
1336 if (!supportable_dr_alignment)
1337 return false;
1340 return true;
1343 /* Function vect_enhance_data_refs_alignment
1345 This pass will use loop versioning and loop peeling in order to enhance
1346 the alignment of data references in the loop.
1348 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1349 original loop is to be vectorized. Any other loops that are created by
1350 the transformations performed in this pass - are not supposed to be
1351 vectorized. This restriction will be relaxed.
1353 This pass will require a cost model to guide it whether to apply peeling
1354 or versioning or a combination of the two. For example, the scheme that
1355 intel uses when given a loop with several memory accesses, is as follows:
1356 choose one memory access ('p') which alignment you want to force by doing
1357 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1358 other accesses are not necessarily aligned, or (2) use loop versioning to
1359 generate one loop in which all accesses are aligned, and another loop in
1360 which only 'p' is necessarily aligned.
1362 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1363 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1364 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1366 Devising a cost model is the most critical aspect of this work. It will
1367 guide us on which access to peel for, whether to use loop versioning, how
1368 many versions to create, etc. The cost model will probably consist of
1369 generic considerations as well as target specific considerations (on
1370 powerpc for example, misaligned stores are more painful than misaligned
1371 loads).
1373 Here are the general steps involved in alignment enhancements:
1375 -- original loop, before alignment analysis:
1376 for (i=0; i<N; i++){
1377 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1378 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1381 -- After vect_compute_data_refs_alignment:
1382 for (i=0; i<N; i++){
1383 x = q[i]; # DR_MISALIGNMENT(q) = 3
1384 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1387 -- Possibility 1: we do loop versioning:
1388 if (p is aligned) {
1389 for (i=0; i<N; i++){ # loop 1A
1390 x = q[i]; # DR_MISALIGNMENT(q) = 3
1391 p[i] = y; # DR_MISALIGNMENT(p) = 0
1394 else {
1395 for (i=0; i<N; i++){ # loop 1B
1396 x = q[i]; # DR_MISALIGNMENT(q) = 3
1397 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1401 -- Possibility 2: we do loop peeling:
1402 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1403 x = q[i];
1404 p[i] = y;
1406 for (i = 3; i < N; i++){ # loop 2A
1407 x = q[i]; # DR_MISALIGNMENT(q) = 0
1408 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1411 -- Possibility 3: combination of loop peeling and versioning:
1412 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1413 x = q[i];
1414 p[i] = y;
1416 if (p is aligned) {
1417 for (i = 3; i<N; i++){ # loop 3A
1418 x = q[i]; # DR_MISALIGNMENT(q) = 0
1419 p[i] = y; # DR_MISALIGNMENT(p) = 0
1422 else {
1423 for (i = 3; i<N; i++){ # loop 3B
1424 x = q[i]; # DR_MISALIGNMENT(q) = 0
1425 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1429 These loops are later passed to loop_transform to be vectorized. The
1430 vectorizer will use the alignment information to guide the transformation
1431 (whether to generate regular loads/stores, or with special handling for
1432 misalignment). */
1434 bool
1435 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1437 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1438 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1439 enum dr_alignment_support supportable_dr_alignment;
1440 struct data_reference *dr0 = NULL, *first_store = NULL;
1441 struct data_reference *dr;
1442 unsigned int i, j;
1443 bool do_peeling = false;
1444 bool do_versioning = false;
1445 bool stat;
1446 gimple *stmt;
1447 stmt_vec_info stmt_info;
1448 unsigned int npeel = 0;
1449 bool one_misalignment_known = false;
1450 bool one_misalignment_unknown = false;
1451 bool one_dr_unsupportable = false;
1452 struct data_reference *unsupportable_dr = NULL;
1453 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1454 unsigned possible_npeel_number = 1;
1455 tree vectype;
1456 unsigned int nelements, mis, same_align_drs_max = 0;
1457 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1458 hash_table<peel_info_hasher> peeling_htab (1);
1460 if (dump_enabled_p ())
1461 dump_printf_loc (MSG_NOTE, vect_location,
1462 "=== vect_enhance_data_refs_alignment ===\n");
1464 /* Reset data so we can safely be called multiple times. */
1465 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1466 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1468 /* While cost model enhancements are expected in the future, the high level
1469 view of the code at this time is as follows:
1471 A) If there is a misaligned access then see if peeling to align
1472 this access can make all data references satisfy
1473 vect_supportable_dr_alignment. If so, update data structures
1474 as needed and return true.
1476 B) If peeling wasn't possible and there is a data reference with an
1477 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1478 then see if loop versioning checks can be used to make all data
1479 references satisfy vect_supportable_dr_alignment. If so, update
1480 data structures as needed and return true.
1482 C) If neither peeling nor versioning were successful then return false if
1483 any data reference does not satisfy vect_supportable_dr_alignment.
1485 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1487 Note, Possibility 3 above (which is peeling and versioning together) is not
1488 being done at this time. */
1490 /* (1) Peeling to force alignment. */
1492 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1493 Considerations:
1494 + How many accesses will become aligned due to the peeling
1495 - How many accesses will become unaligned due to the peeling,
1496 and the cost of misaligned accesses.
1497 - The cost of peeling (the extra runtime checks, the increase
1498 in code size). */
1500 FOR_EACH_VEC_ELT (datarefs, i, dr)
1502 stmt = DR_STMT (dr);
1503 stmt_info = vinfo_for_stmt (stmt);
1505 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1506 continue;
1508 /* For interleaving, only the alignment of the first access
1509 matters. */
1510 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1511 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1512 continue;
1514 /* For invariant accesses there is nothing to enhance. */
1515 if (integer_zerop (DR_STEP (dr)))
1516 continue;
1518 /* Strided accesses perform only component accesses, alignment is
1519 irrelevant for them. */
1520 if (STMT_VINFO_STRIDED_P (stmt_info)
1521 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1522 continue;
1524 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1525 do_peeling = vector_alignment_reachable_p (dr);
1526 if (do_peeling)
1528 if (known_alignment_for_access_p (dr))
1530 unsigned int npeel_tmp = 0;
1531 bool negative = tree_int_cst_compare (DR_STEP (dr),
1532 size_zero_node) < 0;
1534 vectype = STMT_VINFO_VECTYPE (stmt_info);
1535 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1536 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1537 TREE_TYPE (DR_REF (dr))));
1538 if (DR_MISALIGNMENT (dr) != 0)
1539 npeel_tmp = (negative ? (mis - nelements)
1540 : (nelements - mis)) & (nelements - 1);
1542 /* For multiple types, it is possible that the bigger type access
1543 will have more than one peeling option. E.g., a loop with two
1544 types: one of size (vector size / 4), and the other one of
1545 size (vector size / 8). Vectorization factor will 8. If both
1546 accesses are misaligned by 3, the first one needs one scalar
1547 iteration to be aligned, and the second one needs 5. But the
1548 first one will be aligned also by peeling 5 scalar
1549 iterations, and in that case both accesses will be aligned.
1550 Hence, except for the immediate peeling amount, we also want
1551 to try to add full vector size, while we don't exceed
1552 vectorization factor.
1553 We do this automatically for cost model, since we calculate
1554 cost for every peeling option. */
1555 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1557 if (STMT_SLP_TYPE (stmt_info))
1558 possible_npeel_number
1559 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1560 else
1561 possible_npeel_number = vf / nelements;
1563 /* NPEEL_TMP is 0 when there is no misalignment, but also
1564 allow peeling NELEMENTS. */
1565 if (DR_MISALIGNMENT (dr) == 0)
1566 possible_npeel_number++;
1569 /* Save info about DR in the hash table. Also include peeling
1570 amounts according to the explanation above. */
1571 for (j = 0; j < possible_npeel_number; j++)
1573 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1574 dr, npeel_tmp);
1575 npeel_tmp += nelements;
1578 one_misalignment_known = true;
1580 else
1582 /* If we don't know any misalignment values, we prefer
1583 peeling for data-ref that has the maximum number of data-refs
1584 with the same alignment, unless the target prefers to align
1585 stores over load. */
1586 unsigned same_align_drs
1587 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1588 if (!dr0
1589 || same_align_drs_max < same_align_drs)
1591 same_align_drs_max = same_align_drs;
1592 dr0 = dr;
1594 /* For data-refs with the same number of related
1595 accesses prefer the one where the misalign
1596 computation will be invariant in the outermost loop. */
1597 else if (same_align_drs_max == same_align_drs)
1599 struct loop *ivloop0, *ivloop;
1600 ivloop0 = outermost_invariant_loop_for_expr
1601 (loop, DR_BASE_ADDRESS (dr0));
1602 ivloop = outermost_invariant_loop_for_expr
1603 (loop, DR_BASE_ADDRESS (dr));
1604 if ((ivloop && !ivloop0)
1605 || (ivloop && ivloop0
1606 && flow_loop_nested_p (ivloop, ivloop0)))
1607 dr0 = dr;
1610 one_misalignment_unknown = true;
1612 /* Check for data refs with unsupportable alignment that
1613 can be peeled. */
1614 if (!supportable_dr_alignment)
1616 one_dr_unsupportable = true;
1617 unsupportable_dr = dr;
1620 if (!first_store && DR_IS_WRITE (dr))
1621 first_store = dr;
1624 else
1626 if (!aligned_access_p (dr))
1628 if (dump_enabled_p ())
1629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630 "vector alignment may not be reachable\n");
1631 break;
1636 /* Check if we can possibly peel the loop. */
1637 if (!vect_can_advance_ivs_p (loop_vinfo)
1638 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1639 || loop->inner)
1640 do_peeling = false;
1642 struct _vect_peel_extended_info peel_for_known_alignment;
1643 struct _vect_peel_extended_info peel_for_unknown_alignment;
1644 struct _vect_peel_extended_info best_peel;
1646 peel_for_unknown_alignment.inside_cost = INT_MAX;
1647 peel_for_unknown_alignment.outside_cost = INT_MAX;
1648 peel_for_unknown_alignment.peel_info.count = 0;
1650 if (do_peeling
1651 && one_misalignment_unknown)
1653 /* Check if the target requires to prefer stores over loads, i.e., if
1654 misaligned stores are more expensive than misaligned loads (taking
1655 drs with same alignment into account). */
1656 unsigned int load_inside_cost = 0;
1657 unsigned int load_outside_cost = 0;
1658 unsigned int store_inside_cost = 0;
1659 unsigned int store_outside_cost = 0;
1661 stmt_vector_for_cost dummy;
1662 dummy.create (2);
1663 vect_get_peeling_costs_all_drs (dr0,
1664 &load_inside_cost,
1665 &load_outside_cost,
1666 &dummy, vf / 2, true);
1667 dummy.release ();
1669 if (first_store)
1671 dummy.create (2);
1672 vect_get_peeling_costs_all_drs (first_store,
1673 &store_inside_cost,
1674 &store_outside_cost,
1675 &dummy, vf / 2, true);
1676 dummy.release ();
1678 else
1680 store_inside_cost = INT_MAX;
1681 store_outside_cost = INT_MAX;
1684 if (load_inside_cost > store_inside_cost
1685 || (load_inside_cost == store_inside_cost
1686 && load_outside_cost > store_outside_cost))
1688 dr0 = first_store;
1689 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1690 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1692 else
1694 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1695 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1698 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1699 prologue_cost_vec.create (2);
1700 epilogue_cost_vec.create (2);
1702 int dummy2;
1703 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1704 (loop_vinfo, vf / 2, &dummy2,
1705 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1706 &prologue_cost_vec, &epilogue_cost_vec);
1708 prologue_cost_vec.release ();
1709 epilogue_cost_vec.release ();
1711 peel_for_unknown_alignment.peel_info.count = 1
1712 + STMT_VINFO_SAME_ALIGN_REFS
1713 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1716 peel_for_unknown_alignment.peel_info.npeel = 0;
1717 peel_for_unknown_alignment.peel_info.dr = dr0;
1719 best_peel = peel_for_unknown_alignment;
1721 peel_for_known_alignment.inside_cost = INT_MAX;
1722 peel_for_known_alignment.outside_cost = INT_MAX;
1723 peel_for_known_alignment.peel_info.count = 0;
1724 peel_for_known_alignment.peel_info.dr = NULL;
1726 if (do_peeling && one_misalignment_known)
1728 /* Peeling is possible, but there is no data access that is not supported
1729 unless aligned. So we try to choose the best possible peeling from
1730 the hash table. */
1731 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1732 (&peeling_htab, loop_vinfo, &npeel, &body_cost_vec);
1735 /* Compare costs of peeling for known and unknown alignment. */
1736 if (peel_for_known_alignment.peel_info.dr != NULL
1737 && peel_for_unknown_alignment.inside_cost
1738 >= peel_for_known_alignment.inside_cost)
1740 best_peel = peel_for_known_alignment;
1742 /* If the best peeling for known alignment has NPEEL == 0, perform no
1743 peeling at all except if there is an unsupportable dr that we can
1744 align. */
1745 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1746 do_peeling = false;
1749 /* If there is an unsupportable data ref, prefer this over all choices so far
1750 since we'd have to discard a chosen peeling except when it accidentally
1751 aligned the unsupportable data ref. */
1752 if (one_dr_unsupportable)
1753 dr0 = unsupportable_dr;
1754 else if (do_peeling)
1756 /* Calculate the penalty for no peeling, i.e. leaving everything
1757 unaligned.
1758 TODO: Adapt vect_get_peeling_costs_all_drs and use here. */
1759 unsigned nopeel_inside_cost = 0;
1760 unsigned nopeel_outside_cost = 0;
1762 stmt_vector_for_cost dummy;
1763 dummy.create (2);
1764 FOR_EACH_VEC_ELT (datarefs, i, dr)
1765 vect_get_data_access_cost (dr, &nopeel_inside_cost,
1766 &nopeel_outside_cost, &dummy);
1767 dummy.release ();
1769 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1770 costs will be recorded. */
1771 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1772 prologue_cost_vec.create (2);
1773 epilogue_cost_vec.create (2);
1775 int dummy2;
1776 nopeel_outside_cost += vect_get_known_peeling_cost
1777 (loop_vinfo, 0, &dummy2,
1778 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1779 &prologue_cost_vec, &epilogue_cost_vec);
1781 prologue_cost_vec.release ();
1782 epilogue_cost_vec.release ();
1784 npeel = best_peel.peel_info.npeel;
1785 dr0 = best_peel.peel_info.dr;
1787 /* If no peeling is not more expensive than the best peeling we
1788 have so far, don't perform any peeling. */
1789 if (nopeel_inside_cost <= best_peel.inside_cost)
1790 do_peeling = false;
1793 if (do_peeling)
1795 stmt = DR_STMT (dr0);
1796 stmt_info = vinfo_for_stmt (stmt);
1797 vectype = STMT_VINFO_VECTYPE (stmt_info);
1798 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1800 if (known_alignment_for_access_p (dr0))
1802 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1803 size_zero_node) < 0;
1804 if (!npeel)
1806 /* Since it's known at compile time, compute the number of
1807 iterations in the peeled loop (the peeling factor) for use in
1808 updating DR_MISALIGNMENT values. The peeling factor is the
1809 vectorization factor minus the misalignment as an element
1810 count. */
1811 mis = DR_MISALIGNMENT (dr0);
1812 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1813 npeel = ((negative ? mis - nelements : nelements - mis)
1814 & (nelements - 1));
1817 /* For interleaved data access every iteration accesses all the
1818 members of the group, therefore we divide the number of iterations
1819 by the group size. */
1820 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1821 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1822 npeel /= GROUP_SIZE (stmt_info);
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_NOTE, vect_location,
1826 "Try peeling by %d\n", npeel);
1829 /* Ensure that all datarefs can be vectorized after the peel. */
1830 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1831 do_peeling = false;
1833 /* Check if all datarefs are supportable and log. */
1834 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1836 stat = vect_verify_datarefs_alignment (loop_vinfo);
1837 if (!stat)
1838 do_peeling = false;
1839 else
1841 body_cost_vec.release ();
1842 return stat;
1846 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1847 if (do_peeling)
1849 unsigned max_allowed_peel
1850 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1851 if (max_allowed_peel != (unsigned)-1)
1853 unsigned max_peel = npeel;
1854 if (max_peel == 0)
1856 gimple *dr_stmt = DR_STMT (dr0);
1857 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1858 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1859 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1861 if (max_peel > max_allowed_peel)
1863 do_peeling = false;
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_NOTE, vect_location,
1866 "Disable peeling, max peels reached: %d\n", max_peel);
1871 /* Cost model #2 - if peeling may result in a remaining loop not
1872 iterating enough to be vectorized then do not peel. */
1873 if (do_peeling
1874 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1876 unsigned max_peel
1877 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1878 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1879 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1880 do_peeling = false;
1883 if (do_peeling)
1885 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1886 If the misalignment of DR_i is identical to that of dr0 then set
1887 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1888 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1889 by the peeling factor times the element size of DR_i (MOD the
1890 vectorization factor times the size). Otherwise, the
1891 misalignment of DR_i must be set to unknown. */
1892 FOR_EACH_VEC_ELT (datarefs, i, dr)
1893 if (dr != dr0)
1895 /* Strided accesses perform only component accesses, alignment
1896 is irrelevant for them. */
1897 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1898 if (STMT_VINFO_STRIDED_P (stmt_info)
1899 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1900 continue;
1902 vect_update_misalignment_for_peel (dr, dr0, npeel);
1905 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1906 if (npeel)
1907 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1908 else
1909 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1910 = DR_MISALIGNMENT (dr0);
1911 SET_DR_MISALIGNMENT (dr0, 0);
1912 if (dump_enabled_p ())
1914 dump_printf_loc (MSG_NOTE, vect_location,
1915 "Alignment of access forced using peeling.\n");
1916 dump_printf_loc (MSG_NOTE, vect_location,
1917 "Peeling for alignment will be applied.\n");
1919 /* The inside-loop cost will be accounted for in vectorizable_load
1920 and vectorizable_store correctly with adjusted alignments.
1921 Drop the body_cst_vec on the floor here. */
1922 body_cost_vec.release ();
1924 stat = vect_verify_datarefs_alignment (loop_vinfo);
1925 gcc_assert (stat);
1926 return stat;
1930 body_cost_vec.release ();
1932 /* (2) Versioning to force alignment. */
1934 /* Try versioning if:
1935 1) optimize loop for speed
1936 2) there is at least one unsupported misaligned data ref with an unknown
1937 misalignment, and
1938 3) all misaligned data refs with a known misalignment are supported, and
1939 4) the number of runtime alignment checks is within reason. */
1941 do_versioning =
1942 optimize_loop_nest_for_speed_p (loop)
1943 && (!loop->inner); /* FORNOW */
1945 if (do_versioning)
1947 FOR_EACH_VEC_ELT (datarefs, i, dr)
1949 stmt = DR_STMT (dr);
1950 stmt_info = vinfo_for_stmt (stmt);
1952 /* For interleaving, only the alignment of the first access
1953 matters. */
1954 if (aligned_access_p (dr)
1955 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1956 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1957 continue;
1959 if (STMT_VINFO_STRIDED_P (stmt_info))
1961 /* Strided loads perform only component accesses, alignment is
1962 irrelevant for them. */
1963 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1964 continue;
1965 do_versioning = false;
1966 break;
1969 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1971 if (!supportable_dr_alignment)
1973 gimple *stmt;
1974 int mask;
1975 tree vectype;
1977 if (known_alignment_for_access_p (dr)
1978 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1979 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1981 do_versioning = false;
1982 break;
1985 stmt = DR_STMT (dr);
1986 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1987 gcc_assert (vectype);
1989 /* The rightmost bits of an aligned address must be zeros.
1990 Construct the mask needed for this test. For example,
1991 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1992 mask must be 15 = 0xf. */
1993 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1995 /* FORNOW: use the same mask to test all potentially unaligned
1996 references in the loop. The vectorizer currently supports
1997 a single vector size, see the reference to
1998 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1999 vectorization factor is computed. */
2000 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2001 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2002 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2003 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2004 DR_STMT (dr));
2008 /* Versioning requires at least one misaligned data reference. */
2009 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2010 do_versioning = false;
2011 else if (!do_versioning)
2012 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2015 if (do_versioning)
2017 vec<gimple *> may_misalign_stmts
2018 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2019 gimple *stmt;
2021 /* It can now be assumed that the data references in the statements
2022 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2023 of the loop being vectorized. */
2024 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2026 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2027 dr = STMT_VINFO_DATA_REF (stmt_info);
2028 SET_DR_MISALIGNMENT (dr, 0);
2029 if (dump_enabled_p ())
2030 dump_printf_loc (MSG_NOTE, vect_location,
2031 "Alignment of access forced using versioning.\n");
2034 if (dump_enabled_p ())
2035 dump_printf_loc (MSG_NOTE, vect_location,
2036 "Versioning for alignment will be applied.\n");
2038 /* Peeling and versioning can't be done together at this time. */
2039 gcc_assert (! (do_peeling && do_versioning));
2041 stat = vect_verify_datarefs_alignment (loop_vinfo);
2042 gcc_assert (stat);
2043 return stat;
2046 /* This point is reached if neither peeling nor versioning is being done. */
2047 gcc_assert (! (do_peeling || do_versioning));
2049 stat = vect_verify_datarefs_alignment (loop_vinfo);
2050 return stat;
2054 /* Function vect_find_same_alignment_drs.
2056 Update group and alignment relations according to the chosen
2057 vectorization factor. */
2059 static void
2060 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2062 struct data_reference *dra = DDR_A (ddr);
2063 struct data_reference *drb = DDR_B (ddr);
2064 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2065 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2067 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2068 return;
2070 if (dra == drb)
2071 return;
2073 if (!operand_equal_p (DR_BASE_OBJECT (dra), DR_BASE_OBJECT (drb),
2074 OEP_ADDRESS_OF)
2075 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2076 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2077 return;
2079 /* Two references with distance zero have the same alignment. */
2080 offset_int diff = (wi::to_offset (DR_INIT (dra))
2081 - wi::to_offset (DR_INIT (drb)));
2082 if (diff != 0)
2084 /* Get the wider of the two alignments. */
2085 unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2086 unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2087 unsigned int max_align = MAX (align_a, align_b);
2089 /* Require the gap to be a multiple of the larger vector alignment. */
2090 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2091 return;
2094 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2095 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2096 if (dump_enabled_p ())
2098 dump_printf_loc (MSG_NOTE, vect_location,
2099 "accesses have the same alignment: ");
2100 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2101 dump_printf (MSG_NOTE, " and ");
2102 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2103 dump_printf (MSG_NOTE, "\n");
2108 /* Function vect_analyze_data_refs_alignment
2110 Analyze the alignment of the data-references in the loop.
2111 Return FALSE if a data reference is found that cannot be vectorized. */
2113 bool
2114 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2116 if (dump_enabled_p ())
2117 dump_printf_loc (MSG_NOTE, vect_location,
2118 "=== vect_analyze_data_refs_alignment ===\n");
2120 /* Mark groups of data references with same alignment using
2121 data dependence information. */
2122 vec<ddr_p> ddrs = vinfo->ddrs;
2123 struct data_dependence_relation *ddr;
2124 unsigned int i;
2126 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2127 vect_find_same_alignment_drs (ddr);
2129 vec<data_reference_p> datarefs = vinfo->datarefs;
2130 struct data_reference *dr;
2132 FOR_EACH_VEC_ELT (datarefs, i, dr)
2134 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2135 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2136 && !vect_compute_data_ref_alignment (dr))
2138 /* Strided accesses perform only component accesses, misalignment
2139 information is irrelevant for them. */
2140 if (STMT_VINFO_STRIDED_P (stmt_info)
2141 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2142 continue;
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2146 "not vectorized: can't calculate alignment "
2147 "for data ref.\n");
2149 return false;
2153 return true;
2157 /* Analyze alignment of DRs of stmts in NODE. */
2159 static bool
2160 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2162 /* We vectorize from the first scalar stmt in the node unless
2163 the node is permuted in which case we start from the first
2164 element in the group. */
2165 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2166 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2167 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2168 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2170 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2171 if (! vect_compute_data_ref_alignment (dr)
2172 /* For creating the data-ref pointer we need alignment of the
2173 first element anyway. */
2174 || (dr != first_dr
2175 && ! vect_compute_data_ref_alignment (first_dr))
2176 || ! verify_data_ref_alignment (dr))
2178 if (dump_enabled_p ())
2179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2180 "not vectorized: bad data alignment in basic "
2181 "block.\n");
2182 return false;
2185 return true;
2188 /* Function vect_slp_analyze_instance_alignment
2190 Analyze the alignment of the data-references in the SLP instance.
2191 Return FALSE if a data reference is found that cannot be vectorized. */
2193 bool
2194 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2196 if (dump_enabled_p ())
2197 dump_printf_loc (MSG_NOTE, vect_location,
2198 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2200 slp_tree node;
2201 unsigned i;
2202 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2203 if (! vect_slp_analyze_and_verify_node_alignment (node))
2204 return false;
2206 node = SLP_INSTANCE_TREE (instance);
2207 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2208 && ! vect_slp_analyze_and_verify_node_alignment
2209 (SLP_INSTANCE_TREE (instance)))
2210 return false;
2212 return true;
2216 /* Analyze groups of accesses: check that DR belongs to a group of
2217 accesses of legal size, step, etc. Detect gaps, single element
2218 interleaving, and other special cases. Set grouped access info.
2219 Collect groups of strided stores for further use in SLP analysis.
2220 Worker for vect_analyze_group_access. */
2222 static bool
2223 vect_analyze_group_access_1 (struct data_reference *dr)
2225 tree step = DR_STEP (dr);
2226 tree scalar_type = TREE_TYPE (DR_REF (dr));
2227 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2228 gimple *stmt = DR_STMT (dr);
2229 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2230 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2231 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2232 HOST_WIDE_INT dr_step = -1;
2233 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2234 bool slp_impossible = false;
2236 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2237 size of the interleaving group (including gaps). */
2238 if (tree_fits_shwi_p (step))
2240 dr_step = tree_to_shwi (step);
2241 /* Check that STEP is a multiple of type size. Otherwise there is
2242 a non-element-sized gap at the end of the group which we
2243 cannot represent in GROUP_GAP or GROUP_SIZE.
2244 ??? As we can handle non-constant step fine here we should
2245 simply remove uses of GROUP_GAP between the last and first
2246 element and instead rely on DR_STEP. GROUP_SIZE then would
2247 simply not include that gap. */
2248 if ((dr_step % type_size) != 0)
2250 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "Step ");
2254 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2255 dump_printf (MSG_NOTE,
2256 " is not a multiple of the element size for ");
2257 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2258 dump_printf (MSG_NOTE, "\n");
2260 return false;
2262 groupsize = absu_hwi (dr_step) / type_size;
2264 else
2265 groupsize = 0;
2267 /* Not consecutive access is possible only if it is a part of interleaving. */
2268 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2270 /* Check if it this DR is a part of interleaving, and is a single
2271 element of the group that is accessed in the loop. */
2273 /* Gaps are supported only for loads. STEP must be a multiple of the type
2274 size. The size of the group must be a power of 2. */
2275 if (DR_IS_READ (dr)
2276 && (dr_step % type_size) == 0
2277 && groupsize > 0
2278 && pow2p_hwi (groupsize))
2280 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2281 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2282 GROUP_GAP (stmt_info) = groupsize - 1;
2283 if (dump_enabled_p ())
2285 dump_printf_loc (MSG_NOTE, vect_location,
2286 "Detected single element interleaving ");
2287 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2288 dump_printf (MSG_NOTE, " step ");
2289 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2290 dump_printf (MSG_NOTE, "\n");
2293 return true;
2296 if (dump_enabled_p ())
2298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2299 "not consecutive access ");
2300 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2303 if (bb_vinfo)
2305 /* Mark the statement as unvectorizable. */
2306 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2307 return true;
2310 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2311 STMT_VINFO_STRIDED_P (stmt_info) = true;
2312 return true;
2315 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2317 /* First stmt in the interleaving chain. Check the chain. */
2318 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2319 struct data_reference *data_ref = dr;
2320 unsigned int count = 1;
2321 tree prev_init = DR_INIT (data_ref);
2322 gimple *prev = stmt;
2323 HOST_WIDE_INT diff, gaps = 0;
2325 while (next)
2327 /* Skip same data-refs. In case that two or more stmts share
2328 data-ref (supported only for loads), we vectorize only the first
2329 stmt, and the rest get their vectorized loads from the first
2330 one. */
2331 if (!tree_int_cst_compare (DR_INIT (data_ref),
2332 DR_INIT (STMT_VINFO_DATA_REF (
2333 vinfo_for_stmt (next)))))
2335 if (DR_IS_WRITE (data_ref))
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2339 "Two store stmts share the same dr.\n");
2340 return false;
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2345 "Two or more load stmts share the same dr.\n");
2347 /* For load use the same data-ref load. */
2348 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2350 prev = next;
2351 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2352 continue;
2355 prev = next;
2356 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2358 /* All group members have the same STEP by construction. */
2359 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2361 /* Check that the distance between two accesses is equal to the type
2362 size. Otherwise, we have gaps. */
2363 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2364 - TREE_INT_CST_LOW (prev_init)) / type_size;
2365 if (diff != 1)
2367 /* FORNOW: SLP of accesses with gaps is not supported. */
2368 slp_impossible = true;
2369 if (DR_IS_WRITE (data_ref))
2371 if (dump_enabled_p ())
2372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2373 "interleaved store with gaps\n");
2374 return false;
2377 gaps += diff - 1;
2380 last_accessed_element += diff;
2382 /* Store the gap from the previous member of the group. If there is no
2383 gap in the access, GROUP_GAP is always 1. */
2384 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2386 prev_init = DR_INIT (data_ref);
2387 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2388 /* Count the number of data-refs in the chain. */
2389 count++;
2392 if (groupsize == 0)
2393 groupsize = count + gaps;
2395 /* This could be UINT_MAX but as we are generating code in a very
2396 inefficient way we have to cap earlier. See PR78699 for example. */
2397 if (groupsize > 4096)
2399 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2401 "group is too large\n");
2402 return false;
2405 /* Check that the size of the interleaving is equal to count for stores,
2406 i.e., that there are no gaps. */
2407 if (groupsize != count
2408 && !DR_IS_READ (dr))
2410 if (dump_enabled_p ())
2411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2412 "interleaved store with gaps\n");
2413 return false;
2416 /* If there is a gap after the last load in the group it is the
2417 difference between the groupsize and the last accessed
2418 element.
2419 When there is no gap, this difference should be 0. */
2420 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2422 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2423 if (dump_enabled_p ())
2425 dump_printf_loc (MSG_NOTE, vect_location,
2426 "Detected interleaving ");
2427 if (DR_IS_READ (dr))
2428 dump_printf (MSG_NOTE, "load ");
2429 else
2430 dump_printf (MSG_NOTE, "store ");
2431 dump_printf (MSG_NOTE, "of size %u starting with ",
2432 (unsigned)groupsize);
2433 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2434 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2435 dump_printf_loc (MSG_NOTE, vect_location,
2436 "There is a gap of %u elements after the group\n",
2437 GROUP_GAP (vinfo_for_stmt (stmt)));
2440 /* SLP: create an SLP data structure for every interleaving group of
2441 stores for further analysis in vect_analyse_slp. */
2442 if (DR_IS_WRITE (dr) && !slp_impossible)
2444 if (loop_vinfo)
2445 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2446 if (bb_vinfo)
2447 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2451 return true;
2454 /* Analyze groups of accesses: check that DR belongs to a group of
2455 accesses of legal size, step, etc. Detect gaps, single element
2456 interleaving, and other special cases. Set grouped access info.
2457 Collect groups of strided stores for further use in SLP analysis. */
2459 static bool
2460 vect_analyze_group_access (struct data_reference *dr)
2462 if (!vect_analyze_group_access_1 (dr))
2464 /* Dissolve the group if present. */
2465 gimple *next;
2466 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2467 while (stmt)
2469 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2470 next = GROUP_NEXT_ELEMENT (vinfo);
2471 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2472 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2473 stmt = next;
2475 return false;
2477 return true;
2480 /* Analyze the access pattern of the data-reference DR.
2481 In case of non-consecutive accesses call vect_analyze_group_access() to
2482 analyze groups of accesses. */
2484 static bool
2485 vect_analyze_data_ref_access (struct data_reference *dr)
2487 tree step = DR_STEP (dr);
2488 tree scalar_type = TREE_TYPE (DR_REF (dr));
2489 gimple *stmt = DR_STMT (dr);
2490 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2491 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2492 struct loop *loop = NULL;
2494 if (loop_vinfo)
2495 loop = LOOP_VINFO_LOOP (loop_vinfo);
2497 if (loop_vinfo && !step)
2499 if (dump_enabled_p ())
2500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2501 "bad data-ref access in loop\n");
2502 return false;
2505 /* Allow loads with zero step in inner-loop vectorization. */
2506 if (loop_vinfo && integer_zerop (step))
2508 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2509 if (!nested_in_vect_loop_p (loop, stmt))
2510 return DR_IS_READ (dr);
2511 /* Allow references with zero step for outer loops marked
2512 with pragma omp simd only - it guarantees absence of
2513 loop-carried dependencies between inner loop iterations. */
2514 if (!loop->force_vectorize)
2516 if (dump_enabled_p ())
2517 dump_printf_loc (MSG_NOTE, vect_location,
2518 "zero step in inner loop of nest\n");
2519 return false;
2523 if (loop && nested_in_vect_loop_p (loop, stmt))
2525 /* Interleaved accesses are not yet supported within outer-loop
2526 vectorization for references in the inner-loop. */
2527 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2529 /* For the rest of the analysis we use the outer-loop step. */
2530 step = STMT_VINFO_DR_STEP (stmt_info);
2531 if (integer_zerop (step))
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_NOTE, vect_location,
2535 "zero step in outer loop.\n");
2536 return DR_IS_READ (dr);
2540 /* Consecutive? */
2541 if (TREE_CODE (step) == INTEGER_CST)
2543 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2544 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2545 || (dr_step < 0
2546 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2548 /* Mark that it is not interleaving. */
2549 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2550 return true;
2554 if (loop && nested_in_vect_loop_p (loop, stmt))
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_NOTE, vect_location,
2558 "grouped access in outer loop.\n");
2559 return false;
2563 /* Assume this is a DR handled by non-constant strided load case. */
2564 if (TREE_CODE (step) != INTEGER_CST)
2565 return (STMT_VINFO_STRIDED_P (stmt_info)
2566 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2567 || vect_analyze_group_access (dr)));
2569 /* Not consecutive access - check if it's a part of interleaving group. */
2570 return vect_analyze_group_access (dr);
2573 /* Compare two data-references DRA and DRB to group them into chunks
2574 suitable for grouping. */
2576 static int
2577 dr_group_sort_cmp (const void *dra_, const void *drb_)
2579 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2580 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2581 int cmp;
2583 /* Stabilize sort. */
2584 if (dra == drb)
2585 return 0;
2587 /* DRs in different loops never belong to the same group. */
2588 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2589 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2590 if (loopa != loopb)
2591 return loopa->num < loopb->num ? -1 : 1;
2593 /* Ordering of DRs according to base. */
2594 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2596 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2597 DR_BASE_ADDRESS (drb));
2598 if (cmp != 0)
2599 return cmp;
2602 /* And according to DR_OFFSET. */
2603 if (!dr_equal_offsets_p (dra, drb))
2605 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2606 if (cmp != 0)
2607 return cmp;
2610 /* Put reads before writes. */
2611 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2612 return DR_IS_READ (dra) ? -1 : 1;
2614 /* Then sort after access size. */
2615 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2616 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2618 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2619 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2620 if (cmp != 0)
2621 return cmp;
2624 /* And after step. */
2625 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2627 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2628 if (cmp != 0)
2629 return cmp;
2632 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2633 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2634 if (cmp == 0)
2635 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2636 return cmp;
2639 /* Function vect_analyze_data_ref_accesses.
2641 Analyze the access pattern of all the data references in the loop.
2643 FORNOW: the only access pattern that is considered vectorizable is a
2644 simple step 1 (consecutive) access.
2646 FORNOW: handle only arrays and pointer accesses. */
2648 bool
2649 vect_analyze_data_ref_accesses (vec_info *vinfo)
2651 unsigned int i;
2652 vec<data_reference_p> datarefs = vinfo->datarefs;
2653 struct data_reference *dr;
2655 if (dump_enabled_p ())
2656 dump_printf_loc (MSG_NOTE, vect_location,
2657 "=== vect_analyze_data_ref_accesses ===\n");
2659 if (datarefs.is_empty ())
2660 return true;
2662 /* Sort the array of datarefs to make building the interleaving chains
2663 linear. Don't modify the original vector's order, it is needed for
2664 determining what dependencies are reversed. */
2665 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2666 datarefs_copy.qsort (dr_group_sort_cmp);
2668 /* Build the interleaving chains. */
2669 for (i = 0; i < datarefs_copy.length () - 1;)
2671 data_reference_p dra = datarefs_copy[i];
2672 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2673 stmt_vec_info lastinfo = NULL;
2674 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2676 ++i;
2677 continue;
2679 for (i = i + 1; i < datarefs_copy.length (); ++i)
2681 data_reference_p drb = datarefs_copy[i];
2682 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2683 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2684 break;
2686 /* ??? Imperfect sorting (non-compatible types, non-modulo
2687 accesses, same accesses) can lead to a group to be artificially
2688 split here as we don't just skip over those. If it really
2689 matters we can push those to a worklist and re-iterate
2690 over them. The we can just skip ahead to the next DR here. */
2692 /* DRs in a different loop should not be put into the same
2693 interleaving group. */
2694 if (gimple_bb (DR_STMT (dra))->loop_father
2695 != gimple_bb (DR_STMT (drb))->loop_father)
2696 break;
2698 /* Check that the data-refs have same first location (except init)
2699 and they are both either store or load (not load and store,
2700 not masked loads or stores). */
2701 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2702 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2703 DR_BASE_ADDRESS (drb), 0)
2704 || !dr_equal_offsets_p (dra, drb)
2705 || !gimple_assign_single_p (DR_STMT (dra))
2706 || !gimple_assign_single_p (DR_STMT (drb)))
2707 break;
2709 /* Check that the data-refs have the same constant size. */
2710 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2711 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2712 if (!tree_fits_uhwi_p (sza)
2713 || !tree_fits_uhwi_p (szb)
2714 || !tree_int_cst_equal (sza, szb))
2715 break;
2717 /* Check that the data-refs have the same step. */
2718 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2719 break;
2721 /* Do not place the same access in the interleaving chain twice. */
2722 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2723 break;
2725 /* Check the types are compatible.
2726 ??? We don't distinguish this during sorting. */
2727 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2728 TREE_TYPE (DR_REF (drb))))
2729 break;
2731 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2732 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2733 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2734 gcc_assert (init_a <= init_b);
2736 /* If init_b == init_a + the size of the type * k, we have an
2737 interleaving, and DRA is accessed before DRB. */
2738 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2739 if (type_size_a == 0
2740 || (init_b - init_a) % type_size_a != 0)
2741 break;
2743 /* If we have a store, the accesses are adjacent. This splits
2744 groups into chunks we support (we don't support vectorization
2745 of stores with gaps). */
2746 if (!DR_IS_READ (dra)
2747 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2748 (DR_INIT (datarefs_copy[i-1]))
2749 != type_size_a))
2750 break;
2752 /* If the step (if not zero or non-constant) is greater than the
2753 difference between data-refs' inits this splits groups into
2754 suitable sizes. */
2755 if (tree_fits_shwi_p (DR_STEP (dra)))
2757 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2758 if (step != 0 && step <= (init_b - init_a))
2759 break;
2762 if (dump_enabled_p ())
2764 dump_printf_loc (MSG_NOTE, vect_location,
2765 "Detected interleaving ");
2766 if (DR_IS_READ (dra))
2767 dump_printf (MSG_NOTE, "load ");
2768 else
2769 dump_printf (MSG_NOTE, "store ");
2770 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2771 dump_printf (MSG_NOTE, " and ");
2772 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2773 dump_printf (MSG_NOTE, "\n");
2776 /* Link the found element into the group list. */
2777 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2779 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2780 lastinfo = stmtinfo_a;
2782 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2783 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2784 lastinfo = stmtinfo_b;
2788 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2789 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2790 && !vect_analyze_data_ref_access (dr))
2792 if (dump_enabled_p ())
2793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2794 "not vectorized: complicated access pattern.\n");
2796 if (is_a <bb_vec_info> (vinfo))
2798 /* Mark the statement as not vectorizable. */
2799 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2800 continue;
2802 else
2804 datarefs_copy.release ();
2805 return false;
2809 datarefs_copy.release ();
2810 return true;
2813 /* Function vect_vfa_segment_size.
2815 Create an expression that computes the size of segment
2816 that will be accessed for a data reference. The functions takes into
2817 account that realignment loads may access one more vector.
2819 Input:
2820 DR: The data reference.
2821 LENGTH_FACTOR: segment length to consider.
2823 Return an expression whose value is the size of segment which will be
2824 accessed by DR. */
2826 static tree
2827 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2829 tree segment_length;
2831 if (integer_zerop (DR_STEP (dr)))
2832 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2833 else
2834 segment_length = size_binop (MULT_EXPR,
2835 fold_convert (sizetype, DR_STEP (dr)),
2836 fold_convert (sizetype, length_factor));
2838 if (vect_supportable_dr_alignment (dr, false)
2839 == dr_explicit_realign_optimized)
2841 tree vector_size = TYPE_SIZE_UNIT
2842 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2844 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2846 return segment_length;
2849 /* Function vect_no_alias_p.
2851 Given data references A and B with equal base and offset, the alias
2852 relation can be decided at compilation time, return TRUE if they do
2853 not alias to each other; return FALSE otherwise. SEGMENT_LENGTH_A
2854 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2855 respectively. */
2857 static bool
2858 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2859 tree segment_length_a, tree segment_length_b)
2861 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2862 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2863 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2864 return false;
2866 tree seg_a_min = DR_INIT (a);
2867 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2868 seg_a_min, segment_length_a);
2869 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2870 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2871 [a, a+12) */
2872 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2874 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2875 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2876 seg_a_max, unit_size);
2877 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2878 DR_INIT (a), unit_size);
2880 tree seg_b_min = DR_INIT (b);
2881 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2882 seg_b_min, segment_length_b);
2883 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
2885 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
2886 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
2887 seg_b_max, unit_size);
2888 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
2889 DR_INIT (b), unit_size);
2892 if (tree_int_cst_le (seg_a_max, seg_b_min)
2893 || tree_int_cst_le (seg_b_max, seg_a_min))
2894 return true;
2896 return false;
2899 /* Function vect_prune_runtime_alias_test_list.
2901 Prune a list of ddrs to be tested at run-time by versioning for alias.
2902 Merge several alias checks into one if possible.
2903 Return FALSE if resulting list of ddrs is longer then allowed by
2904 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2906 bool
2907 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2909 vec<ddr_p> may_alias_ddrs =
2910 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2911 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2912 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2913 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2914 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2916 ddr_p ddr;
2917 unsigned int i;
2918 tree length_factor;
2920 if (dump_enabled_p ())
2921 dump_printf_loc (MSG_NOTE, vect_location,
2922 "=== vect_prune_runtime_alias_test_list ===\n");
2924 if (may_alias_ddrs.is_empty ())
2925 return true;
2927 comp_alias_ddrs.create (may_alias_ddrs.length ());
2929 /* First, we collect all data ref pairs for aliasing checks. */
2930 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2932 int comp_res;
2933 struct data_reference *dr_a, *dr_b;
2934 gimple *dr_group_first_a, *dr_group_first_b;
2935 tree segment_length_a, segment_length_b;
2936 gimple *stmt_a, *stmt_b;
2938 dr_a = DDR_A (ddr);
2939 stmt_a = DR_STMT (DDR_A (ddr));
2940 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2941 if (dr_group_first_a)
2943 stmt_a = dr_group_first_a;
2944 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2947 dr_b = DDR_B (ddr);
2948 stmt_b = DR_STMT (DDR_B (ddr));
2949 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2950 if (dr_group_first_b)
2952 stmt_b = dr_group_first_b;
2953 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2956 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2957 length_factor = scalar_loop_iters;
2958 else
2959 length_factor = size_int (vect_factor);
2960 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2961 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2963 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2964 DR_BASE_ADDRESS (dr_b));
2965 if (comp_res == 0)
2966 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
2967 DR_OFFSET (dr_b));
2969 /* Alias is known at compilation time. */
2970 if (comp_res == 0
2971 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
2972 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
2973 && TREE_CODE (segment_length_a) == INTEGER_CST
2974 && TREE_CODE (segment_length_b) == INTEGER_CST)
2976 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
2977 continue;
2979 if (dump_enabled_p ())
2980 dump_printf_loc (MSG_NOTE, vect_location,
2981 "not vectorized: compilation time alias.\n");
2983 return false;
2986 dr_with_seg_len_pair_t dr_with_seg_len_pair
2987 (dr_with_seg_len (dr_a, segment_length_a),
2988 dr_with_seg_len (dr_b, segment_length_b));
2990 /* Canonicalize pairs by sorting the two DR members. */
2991 if (comp_res > 0)
2992 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2994 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2997 prune_runtime_alias_test_list (&comp_alias_ddrs,
2998 (unsigned HOST_WIDE_INT) vect_factor);
2999 dump_printf_loc (MSG_NOTE, vect_location,
3000 "improved number of alias checks from %d to %d\n",
3001 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3002 if ((int) comp_alias_ddrs.length () >
3003 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3005 if (dump_enabled_p ())
3006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3007 "number of versioning for alias "
3008 "run-time tests exceeds %d "
3009 "(--param vect-max-version-for-alias-checks)\n",
3010 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3011 return false;
3014 /* All alias checks have been resolved at compilation time. */
3015 if (!comp_alias_ddrs.length ())
3016 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3018 return true;
3021 /* Return true if a non-affine read or write in STMT is suitable for a
3022 gather load or scatter store. Describe the operation in *INFO if so. */
3024 bool
3025 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3026 gather_scatter_info *info)
3028 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3029 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3030 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3031 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3032 tree offtype = NULL_TREE;
3033 tree decl, base, off;
3034 machine_mode pmode;
3035 int punsignedp, reversep, pvolatilep = 0;
3037 base = DR_REF (dr);
3038 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3039 see if we can use the def stmt of the address. */
3040 if (is_gimple_call (stmt)
3041 && gimple_call_internal_p (stmt)
3042 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3043 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3044 && TREE_CODE (base) == MEM_REF
3045 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3046 && integer_zerop (TREE_OPERAND (base, 1))
3047 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3049 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3050 if (is_gimple_assign (def_stmt)
3051 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3052 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3055 /* The gather and scatter builtins need address of the form
3056 loop_invariant + vector * {1, 2, 4, 8}
3058 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3059 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3060 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3061 multiplications and additions in it. To get a vector, we need
3062 a single SSA_NAME that will be defined in the loop and will
3063 contain everything that is not loop invariant and that can be
3064 vectorized. The following code attempts to find such a preexistng
3065 SSA_NAME OFF and put the loop invariants into a tree BASE
3066 that can be gimplified before the loop. */
3067 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3068 &punsignedp, &reversep, &pvolatilep);
3069 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3071 if (TREE_CODE (base) == MEM_REF)
3073 if (!integer_zerop (TREE_OPERAND (base, 1)))
3075 if (off == NULL_TREE)
3077 offset_int moff = mem_ref_offset (base);
3078 off = wide_int_to_tree (sizetype, moff);
3080 else
3081 off = size_binop (PLUS_EXPR, off,
3082 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3084 base = TREE_OPERAND (base, 0);
3086 else
3087 base = build_fold_addr_expr (base);
3089 if (off == NULL_TREE)
3090 off = size_zero_node;
3092 /* If base is not loop invariant, either off is 0, then we start with just
3093 the constant offset in the loop invariant BASE and continue with base
3094 as OFF, otherwise give up.
3095 We could handle that case by gimplifying the addition of base + off
3096 into some SSA_NAME and use that as off, but for now punt. */
3097 if (!expr_invariant_in_loop_p (loop, base))
3099 if (!integer_zerop (off))
3100 return false;
3101 off = base;
3102 base = size_int (pbitpos / BITS_PER_UNIT);
3104 /* Otherwise put base + constant offset into the loop invariant BASE
3105 and continue with OFF. */
3106 else
3108 base = fold_convert (sizetype, base);
3109 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3112 /* OFF at this point may be either a SSA_NAME or some tree expression
3113 from get_inner_reference. Try to peel off loop invariants from it
3114 into BASE as long as possible. */
3115 STRIP_NOPS (off);
3116 while (offtype == NULL_TREE)
3118 enum tree_code code;
3119 tree op0, op1, add = NULL_TREE;
3121 if (TREE_CODE (off) == SSA_NAME)
3123 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3125 if (expr_invariant_in_loop_p (loop, off))
3126 return false;
3128 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3129 break;
3131 op0 = gimple_assign_rhs1 (def_stmt);
3132 code = gimple_assign_rhs_code (def_stmt);
3133 op1 = gimple_assign_rhs2 (def_stmt);
3135 else
3137 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3138 return false;
3139 code = TREE_CODE (off);
3140 extract_ops_from_tree (off, &code, &op0, &op1);
3142 switch (code)
3144 case POINTER_PLUS_EXPR:
3145 case PLUS_EXPR:
3146 if (expr_invariant_in_loop_p (loop, op0))
3148 add = op0;
3149 off = op1;
3150 do_add:
3151 add = fold_convert (sizetype, add);
3152 if (scale != 1)
3153 add = size_binop (MULT_EXPR, add, size_int (scale));
3154 base = size_binop (PLUS_EXPR, base, add);
3155 continue;
3157 if (expr_invariant_in_loop_p (loop, op1))
3159 add = op1;
3160 off = op0;
3161 goto do_add;
3163 break;
3164 case MINUS_EXPR:
3165 if (expr_invariant_in_loop_p (loop, op1))
3167 add = fold_convert (sizetype, op1);
3168 add = size_binop (MINUS_EXPR, size_zero_node, add);
3169 off = op0;
3170 goto do_add;
3172 break;
3173 case MULT_EXPR:
3174 if (scale == 1 && tree_fits_shwi_p (op1))
3176 scale = tree_to_shwi (op1);
3177 off = op0;
3178 continue;
3180 break;
3181 case SSA_NAME:
3182 off = op0;
3183 continue;
3184 CASE_CONVERT:
3185 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3186 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3187 break;
3188 if (TYPE_PRECISION (TREE_TYPE (op0))
3189 == TYPE_PRECISION (TREE_TYPE (off)))
3191 off = op0;
3192 continue;
3194 if (TYPE_PRECISION (TREE_TYPE (op0))
3195 < TYPE_PRECISION (TREE_TYPE (off)))
3197 off = op0;
3198 offtype = TREE_TYPE (off);
3199 STRIP_NOPS (off);
3200 continue;
3202 break;
3203 default:
3204 break;
3206 break;
3209 /* If at the end OFF still isn't a SSA_NAME or isn't
3210 defined in the loop, punt. */
3211 if (TREE_CODE (off) != SSA_NAME
3212 || expr_invariant_in_loop_p (loop, off))
3213 return false;
3215 if (offtype == NULL_TREE)
3216 offtype = TREE_TYPE (off);
3218 if (DR_IS_READ (dr))
3219 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3220 offtype, scale);
3221 else
3222 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3223 offtype, scale);
3225 if (decl == NULL_TREE)
3226 return false;
3228 info->decl = decl;
3229 info->base = base;
3230 info->offset = off;
3231 info->offset_dt = vect_unknown_def_type;
3232 info->offset_vectype = NULL_TREE;
3233 info->scale = scale;
3234 return true;
3237 /* Function vect_analyze_data_refs.
3239 Find all the data references in the loop or basic block.
3241 The general structure of the analysis of data refs in the vectorizer is as
3242 follows:
3243 1- vect_analyze_data_refs(loop/bb): call
3244 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3245 in the loop/bb and their dependences.
3246 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3247 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3248 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3252 bool
3253 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3255 struct loop *loop = NULL;
3256 unsigned int i;
3257 struct data_reference *dr;
3258 tree scalar_type;
3260 if (dump_enabled_p ())
3261 dump_printf_loc (MSG_NOTE, vect_location,
3262 "=== vect_analyze_data_refs ===\n");
3264 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3265 loop = LOOP_VINFO_LOOP (loop_vinfo);
3267 /* Go through the data-refs, check that the analysis succeeded. Update
3268 pointer from stmt_vec_info struct to DR and vectype. */
3270 vec<data_reference_p> datarefs = vinfo->datarefs;
3271 FOR_EACH_VEC_ELT (datarefs, i, dr)
3273 gimple *stmt;
3274 stmt_vec_info stmt_info;
3275 tree base, offset, init;
3276 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3277 bool simd_lane_access = false;
3278 int vf;
3280 again:
3281 if (!dr || !DR_REF (dr))
3283 if (dump_enabled_p ())
3284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3285 "not vectorized: unhandled data-ref\n");
3286 return false;
3289 stmt = DR_STMT (dr);
3290 stmt_info = vinfo_for_stmt (stmt);
3292 /* Discard clobbers from the dataref vector. We will remove
3293 clobber stmts during vectorization. */
3294 if (gimple_clobber_p (stmt))
3296 free_data_ref (dr);
3297 if (i == datarefs.length () - 1)
3299 datarefs.pop ();
3300 break;
3302 datarefs.ordered_remove (i);
3303 dr = datarefs[i];
3304 goto again;
3307 /* Check that analysis of the data-ref succeeded. */
3308 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3309 || !DR_STEP (dr))
3311 bool maybe_gather
3312 = DR_IS_READ (dr)
3313 && !TREE_THIS_VOLATILE (DR_REF (dr))
3314 && targetm.vectorize.builtin_gather != NULL;
3315 bool maybe_scatter
3316 = DR_IS_WRITE (dr)
3317 && !TREE_THIS_VOLATILE (DR_REF (dr))
3318 && targetm.vectorize.builtin_scatter != NULL;
3319 bool maybe_simd_lane_access
3320 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3322 /* If target supports vector gather loads or scatter stores, or if
3323 this might be a SIMD lane access, see if they can't be used. */
3324 if (is_a <loop_vec_info> (vinfo)
3325 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3326 && !nested_in_vect_loop_p (loop, stmt))
3328 struct data_reference *newdr
3329 = create_data_ref (NULL, loop_containing_stmt (stmt),
3330 DR_REF (dr), stmt, maybe_scatter ? false : true);
3331 gcc_assert (newdr != NULL && DR_REF (newdr));
3332 if (DR_BASE_ADDRESS (newdr)
3333 && DR_OFFSET (newdr)
3334 && DR_INIT (newdr)
3335 && DR_STEP (newdr)
3336 && integer_zerop (DR_STEP (newdr)))
3338 if (maybe_simd_lane_access)
3340 tree off = DR_OFFSET (newdr);
3341 STRIP_NOPS (off);
3342 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3343 && TREE_CODE (off) == MULT_EXPR
3344 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3346 tree step = TREE_OPERAND (off, 1);
3347 off = TREE_OPERAND (off, 0);
3348 STRIP_NOPS (off);
3349 if (CONVERT_EXPR_P (off)
3350 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3351 0)))
3352 < TYPE_PRECISION (TREE_TYPE (off)))
3353 off = TREE_OPERAND (off, 0);
3354 if (TREE_CODE (off) == SSA_NAME)
3356 gimple *def = SSA_NAME_DEF_STMT (off);
3357 tree reft = TREE_TYPE (DR_REF (newdr));
3358 if (is_gimple_call (def)
3359 && gimple_call_internal_p (def)
3360 && (gimple_call_internal_fn (def)
3361 == IFN_GOMP_SIMD_LANE))
3363 tree arg = gimple_call_arg (def, 0);
3364 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3365 arg = SSA_NAME_VAR (arg);
3366 if (arg == loop->simduid
3367 /* For now. */
3368 && tree_int_cst_equal
3369 (TYPE_SIZE_UNIT (reft),
3370 step))
3372 DR_OFFSET (newdr) = ssize_int (0);
3373 DR_STEP (newdr) = step;
3374 DR_OFFSET_ALIGNMENT (newdr)
3375 = BIGGEST_ALIGNMENT;
3376 DR_STEP_ALIGNMENT (newdr)
3377 = highest_pow2_factor (step);
3378 dr = newdr;
3379 simd_lane_access = true;
3385 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3387 dr = newdr;
3388 if (maybe_gather)
3389 gatherscatter = GATHER;
3390 else
3391 gatherscatter = SCATTER;
3394 if (gatherscatter == SG_NONE && !simd_lane_access)
3395 free_data_ref (newdr);
3398 if (gatherscatter == SG_NONE && !simd_lane_access)
3400 if (dump_enabled_p ())
3402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3403 "not vectorized: data ref analysis "
3404 "failed ");
3405 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3408 if (is_a <bb_vec_info> (vinfo))
3409 break;
3411 return false;
3415 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3417 if (dump_enabled_p ())
3418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3419 "not vectorized: base addr of dr is a "
3420 "constant\n");
3422 if (is_a <bb_vec_info> (vinfo))
3423 break;
3425 if (gatherscatter != SG_NONE || simd_lane_access)
3426 free_data_ref (dr);
3427 return false;
3430 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3432 if (dump_enabled_p ())
3434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3435 "not vectorized: volatile type ");
3436 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3439 if (is_a <bb_vec_info> (vinfo))
3440 break;
3442 return false;
3445 if (stmt_can_throw_internal (stmt))
3447 if (dump_enabled_p ())
3449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3450 "not vectorized: statement can throw an "
3451 "exception ");
3452 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3455 if (is_a <bb_vec_info> (vinfo))
3456 break;
3458 if (gatherscatter != SG_NONE || simd_lane_access)
3459 free_data_ref (dr);
3460 return false;
3463 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3464 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3466 if (dump_enabled_p ())
3468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3469 "not vectorized: statement is bitfield "
3470 "access ");
3471 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3474 if (is_a <bb_vec_info> (vinfo))
3475 break;
3477 if (gatherscatter != SG_NONE || simd_lane_access)
3478 free_data_ref (dr);
3479 return false;
3482 base = unshare_expr (DR_BASE_ADDRESS (dr));
3483 offset = unshare_expr (DR_OFFSET (dr));
3484 init = unshare_expr (DR_INIT (dr));
3486 if (is_gimple_call (stmt)
3487 && (!gimple_call_internal_p (stmt)
3488 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3489 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3491 if (dump_enabled_p ())
3493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3494 "not vectorized: dr in a call ");
3495 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3498 if (is_a <bb_vec_info> (vinfo))
3499 break;
3501 if (gatherscatter != SG_NONE || simd_lane_access)
3502 free_data_ref (dr);
3503 return false;
3506 /* Update DR field in stmt_vec_info struct. */
3508 /* If the dataref is in an inner-loop of the loop that is considered for
3509 for vectorization, we also want to analyze the access relative to
3510 the outer-loop (DR contains information only relative to the
3511 inner-most enclosing loop). We do that by building a reference to the
3512 first location accessed by the inner-loop, and analyze it relative to
3513 the outer-loop. */
3514 if (loop && nested_in_vect_loop_p (loop, stmt))
3516 /* Build a reference to the first location accessed by the
3517 inner loop: *(BASE + INIT + OFFSET). By construction,
3518 this address must be invariant in the inner loop, so we
3519 can consider it as being used in the outer loop. */
3520 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
3521 init, offset);
3522 tree init_addr = fold_build_pointer_plus (base, init_offset);
3523 tree init_ref = build_fold_indirect_ref (init_addr);
3525 if (dump_enabled_p ())
3527 dump_printf_loc (MSG_NOTE, vect_location,
3528 "analyze in outer loop: ");
3529 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
3530 dump_printf (MSG_NOTE, "\n");
3533 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
3534 init_ref, loop))
3535 /* dr_analyze_innermost already explained the failure. */
3536 return false;
3538 if (dump_enabled_p ())
3540 dump_printf_loc (MSG_NOTE, vect_location,
3541 "\touter base_address: ");
3542 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3543 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3544 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3545 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3546 STMT_VINFO_DR_OFFSET (stmt_info));
3547 dump_printf (MSG_NOTE,
3548 "\n\touter constant offset from base address: ");
3549 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3550 STMT_VINFO_DR_INIT (stmt_info));
3551 dump_printf (MSG_NOTE, "\n\touter step: ");
3552 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3553 STMT_VINFO_DR_STEP (stmt_info));
3554 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
3555 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
3556 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
3557 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
3558 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
3559 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
3560 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
3561 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
3565 if (STMT_VINFO_DATA_REF (stmt_info))
3567 if (dump_enabled_p ())
3569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3570 "not vectorized: more than one data ref "
3571 "in stmt: ");
3572 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3575 if (is_a <bb_vec_info> (vinfo))
3576 break;
3578 if (gatherscatter != SG_NONE || simd_lane_access)
3579 free_data_ref (dr);
3580 return false;
3583 STMT_VINFO_DATA_REF (stmt_info) = dr;
3584 if (simd_lane_access)
3586 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3587 free_data_ref (datarefs[i]);
3588 datarefs[i] = dr;
3591 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3592 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3593 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3595 if (dump_enabled_p ())
3597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3598 "not vectorized: base object not addressable "
3599 "for stmt: ");
3600 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3602 if (is_a <bb_vec_info> (vinfo))
3604 /* In BB vectorization the ref can still participate
3605 in dependence analysis, we just can't vectorize it. */
3606 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3607 continue;
3609 return false;
3612 /* Set vectype for STMT. */
3613 scalar_type = TREE_TYPE (DR_REF (dr));
3614 STMT_VINFO_VECTYPE (stmt_info)
3615 = get_vectype_for_scalar_type (scalar_type);
3616 if (!STMT_VINFO_VECTYPE (stmt_info))
3618 if (dump_enabled_p ())
3620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3621 "not vectorized: no vectype for stmt: ");
3622 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3623 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3624 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3625 scalar_type);
3626 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3629 if (is_a <bb_vec_info> (vinfo))
3631 /* No vector type is fine, the ref can still participate
3632 in dependence analysis, we just can't vectorize it. */
3633 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3634 continue;
3637 if (gatherscatter != SG_NONE || simd_lane_access)
3639 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3640 if (gatherscatter != SG_NONE)
3641 free_data_ref (dr);
3643 return false;
3645 else
3647 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_NOTE, vect_location,
3650 "got vectype for stmt: ");
3651 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3652 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3653 STMT_VINFO_VECTYPE (stmt_info));
3654 dump_printf (MSG_NOTE, "\n");
3658 /* Adjust the minimal vectorization factor according to the
3659 vector type. */
3660 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3661 if (vf > *min_vf)
3662 *min_vf = vf;
3664 if (gatherscatter != SG_NONE)
3666 gather_scatter_info gs_info;
3667 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3668 &gs_info)
3669 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3671 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3672 free_data_ref (dr);
3673 if (dump_enabled_p ())
3675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3676 (gatherscatter == GATHER) ?
3677 "not vectorized: not suitable for gather "
3678 "load " :
3679 "not vectorized: not suitable for scatter "
3680 "store ");
3681 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3683 return false;
3686 free_data_ref (datarefs[i]);
3687 datarefs[i] = dr;
3688 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3691 else if (is_a <loop_vec_info> (vinfo)
3692 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3694 if (nested_in_vect_loop_p (loop, stmt))
3696 if (dump_enabled_p ())
3698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3699 "not vectorized: not suitable for strided "
3700 "load ");
3701 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3703 return false;
3705 STMT_VINFO_STRIDED_P (stmt_info) = true;
3709 /* If we stopped analysis at the first dataref we could not analyze
3710 when trying to vectorize a basic-block mark the rest of the datarefs
3711 as not vectorizable and truncate the vector of datarefs. That
3712 avoids spending useless time in analyzing their dependence. */
3713 if (i != datarefs.length ())
3715 gcc_assert (is_a <bb_vec_info> (vinfo));
3716 for (unsigned j = i; j < datarefs.length (); ++j)
3718 data_reference_p dr = datarefs[j];
3719 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3720 free_data_ref (dr);
3722 datarefs.truncate (i);
3725 return true;
3729 /* Function vect_get_new_vect_var.
3731 Returns a name for a new variable. The current naming scheme appends the
3732 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3733 the name of vectorizer generated variables, and appends that to NAME if
3734 provided. */
3736 tree
3737 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3739 const char *prefix;
3740 tree new_vect_var;
3742 switch (var_kind)
3744 case vect_simple_var:
3745 prefix = "vect";
3746 break;
3747 case vect_scalar_var:
3748 prefix = "stmp";
3749 break;
3750 case vect_mask_var:
3751 prefix = "mask";
3752 break;
3753 case vect_pointer_var:
3754 prefix = "vectp";
3755 break;
3756 default:
3757 gcc_unreachable ();
3760 if (name)
3762 char* tmp = concat (prefix, "_", name, NULL);
3763 new_vect_var = create_tmp_reg (type, tmp);
3764 free (tmp);
3766 else
3767 new_vect_var = create_tmp_reg (type, prefix);
3769 return new_vect_var;
3772 /* Like vect_get_new_vect_var but return an SSA name. */
3774 tree
3775 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3777 const char *prefix;
3778 tree new_vect_var;
3780 switch (var_kind)
3782 case vect_simple_var:
3783 prefix = "vect";
3784 break;
3785 case vect_scalar_var:
3786 prefix = "stmp";
3787 break;
3788 case vect_pointer_var:
3789 prefix = "vectp";
3790 break;
3791 default:
3792 gcc_unreachable ();
3795 if (name)
3797 char* tmp = concat (prefix, "_", name, NULL);
3798 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3799 free (tmp);
3801 else
3802 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3804 return new_vect_var;
3807 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
3809 static void
3810 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3811 stmt_vec_info stmt_info)
3813 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3814 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3815 int misalign = DR_MISALIGNMENT (dr);
3816 if (misalign == DR_MISALIGNMENT_UNKNOWN)
3817 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3818 else
3819 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3822 /* Function vect_create_addr_base_for_vector_ref.
3824 Create an expression that computes the address of the first memory location
3825 that will be accessed for a data reference.
3827 Input:
3828 STMT: The statement containing the data reference.
3829 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3830 OFFSET: Optional. If supplied, it is be added to the initial address.
3831 LOOP: Specify relative to which loop-nest should the address be computed.
3832 For example, when the dataref is in an inner-loop nested in an
3833 outer-loop that is now being vectorized, LOOP can be either the
3834 outer-loop, or the inner-loop. The first memory location accessed
3835 by the following dataref ('in' points to short):
3837 for (i=0; i<N; i++)
3838 for (j=0; j<M; j++)
3839 s += in[i+j]
3841 is as follows:
3842 if LOOP=i_loop: &in (relative to i_loop)
3843 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3844 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3845 initial address. Unlike OFFSET, which is number of elements to
3846 be added, BYTE_OFFSET is measured in bytes.
3848 Output:
3849 1. Return an SSA_NAME whose value is the address of the memory location of
3850 the first vector of the data reference.
3851 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3852 these statement(s) which define the returned SSA_NAME.
3854 FORNOW: We are only handling array accesses with step 1. */
3856 tree
3857 vect_create_addr_base_for_vector_ref (gimple *stmt,
3858 gimple_seq *new_stmt_list,
3859 tree offset,
3860 tree byte_offset)
3862 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3863 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3864 const char *base_name;
3865 tree addr_base;
3866 tree dest;
3867 gimple_seq seq = NULL;
3868 tree vect_ptr_type;
3869 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3870 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3871 innermost_loop_behavior *drb = vect_dr_behavior (dr);
3873 tree data_ref_base = unshare_expr (drb->base_address);
3874 tree base_offset = unshare_expr (drb->offset);
3875 tree init = unshare_expr (drb->init);
3877 if (loop_vinfo)
3878 base_name = get_name (data_ref_base);
3879 else
3881 base_offset = ssize_int (0);
3882 init = ssize_int (0);
3883 base_name = get_name (DR_REF (dr));
3886 /* Create base_offset */
3887 base_offset = size_binop (PLUS_EXPR,
3888 fold_convert (sizetype, base_offset),
3889 fold_convert (sizetype, init));
3891 if (offset)
3893 offset = fold_build2 (MULT_EXPR, sizetype,
3894 fold_convert (sizetype, offset), step);
3895 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3896 base_offset, offset);
3898 if (byte_offset)
3900 byte_offset = fold_convert (sizetype, byte_offset);
3901 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3902 base_offset, byte_offset);
3905 /* base + base_offset */
3906 if (loop_vinfo)
3907 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3908 else
3910 addr_base = build1 (ADDR_EXPR,
3911 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3912 unshare_expr (DR_REF (dr)));
3915 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3916 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3917 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
3918 gimple_seq_add_seq (new_stmt_list, seq);
3920 if (DR_PTR_INFO (dr)
3921 && TREE_CODE (addr_base) == SSA_NAME
3922 && !SSA_NAME_PTR_INFO (addr_base))
3924 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3925 if (offset || byte_offset)
3926 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3929 if (dump_enabled_p ())
3931 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3932 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3933 dump_printf (MSG_NOTE, "\n");
3936 return addr_base;
3940 /* Function vect_create_data_ref_ptr.
3942 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3943 location accessed in the loop by STMT, along with the def-use update
3944 chain to appropriately advance the pointer through the loop iterations.
3945 Also set aliasing information for the pointer. This pointer is used by
3946 the callers to this function to create a memory reference expression for
3947 vector load/store access.
3949 Input:
3950 1. STMT: a stmt that references memory. Expected to be of the form
3951 GIMPLE_ASSIGN <name, data-ref> or
3952 GIMPLE_ASSIGN <data-ref, name>.
3953 2. AGGR_TYPE: the type of the reference, which should be either a vector
3954 or an array.
3955 3. AT_LOOP: the loop where the vector memref is to be created.
3956 4. OFFSET (optional): an offset to be added to the initial address accessed
3957 by the data-ref in STMT.
3958 5. BSI: location where the new stmts are to be placed if there is no loop
3959 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3960 pointing to the initial address.
3961 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
3962 to the initial address accessed by the data-ref in STMT. This is
3963 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
3964 in bytes.
3966 Output:
3967 1. Declare a new ptr to vector_type, and have it point to the base of the
3968 data reference (initial addressed accessed by the data reference).
3969 For example, for vector of type V8HI, the following code is generated:
3971 v8hi *ap;
3972 ap = (v8hi *)initial_address;
3974 if OFFSET is not supplied:
3975 initial_address = &a[init];
3976 if OFFSET is supplied:
3977 initial_address = &a[init + OFFSET];
3978 if BYTE_OFFSET is supplied:
3979 initial_address = &a[init] + BYTE_OFFSET;
3981 Return the initial_address in INITIAL_ADDRESS.
3983 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3984 update the pointer in each iteration of the loop.
3986 Return the increment stmt that updates the pointer in PTR_INCR.
3988 3. Set INV_P to true if the access pattern of the data reference in the
3989 vectorized loop is invariant. Set it to false otherwise.
3991 4. Return the pointer. */
3993 tree
3994 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
3995 tree offset, tree *initial_address,
3996 gimple_stmt_iterator *gsi, gimple **ptr_incr,
3997 bool only_init, bool *inv_p, tree byte_offset)
3999 const char *base_name;
4000 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4001 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4002 struct loop *loop = NULL;
4003 bool nested_in_vect_loop = false;
4004 struct loop *containing_loop = NULL;
4005 tree aggr_ptr_type;
4006 tree aggr_ptr;
4007 tree new_temp;
4008 gimple_seq new_stmt_list = NULL;
4009 edge pe = NULL;
4010 basic_block new_bb;
4011 tree aggr_ptr_init;
4012 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4013 tree aptr;
4014 gimple_stmt_iterator incr_gsi;
4015 bool insert_after;
4016 tree indx_before_incr, indx_after_incr;
4017 gimple *incr;
4018 tree step;
4019 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4021 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4022 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4024 if (loop_vinfo)
4026 loop = LOOP_VINFO_LOOP (loop_vinfo);
4027 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4028 containing_loop = (gimple_bb (stmt))->loop_father;
4029 pe = loop_preheader_edge (loop);
4031 else
4033 gcc_assert (bb_vinfo);
4034 only_init = true;
4035 *ptr_incr = NULL;
4038 /* Check the step (evolution) of the load in LOOP, and record
4039 whether it's invariant. */
4040 step = vect_dr_behavior (dr)->step;
4041 if (integer_zerop (step))
4042 *inv_p = true;
4043 else
4044 *inv_p = false;
4046 /* Create an expression for the first address accessed by this load
4047 in LOOP. */
4048 base_name = get_name (DR_BASE_ADDRESS (dr));
4050 if (dump_enabled_p ())
4052 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4053 dump_printf_loc (MSG_NOTE, vect_location,
4054 "create %s-pointer variable to type: ",
4055 get_tree_code_name (TREE_CODE (aggr_type)));
4056 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4057 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4058 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4059 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4060 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4061 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4062 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4063 else
4064 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4065 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4066 dump_printf (MSG_NOTE, "\n");
4069 /* (1) Create the new aggregate-pointer variable.
4070 Vector and array types inherit the alias set of their component
4071 type by default so we need to use a ref-all pointer if the data
4072 reference does not conflict with the created aggregated data
4073 reference because it is not addressable. */
4074 bool need_ref_all = false;
4075 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4076 get_alias_set (DR_REF (dr))))
4077 need_ref_all = true;
4078 /* Likewise for any of the data references in the stmt group. */
4079 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4081 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4084 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4085 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4086 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4087 get_alias_set (DR_REF (sdr))))
4089 need_ref_all = true;
4090 break;
4092 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4094 while (orig_stmt);
4096 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4097 need_ref_all);
4098 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4101 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4102 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4103 def-use update cycles for the pointer: one relative to the outer-loop
4104 (LOOP), which is what steps (3) and (4) below do. The other is relative
4105 to the inner-loop (which is the inner-most loop containing the dataref),
4106 and this is done be step (5) below.
4108 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4109 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4110 redundant. Steps (3),(4) create the following:
4112 vp0 = &base_addr;
4113 LOOP: vp1 = phi(vp0,vp2)
4116 vp2 = vp1 + step
4117 goto LOOP
4119 If there is an inner-loop nested in loop, then step (5) will also be
4120 applied, and an additional update in the inner-loop will be created:
4122 vp0 = &base_addr;
4123 LOOP: vp1 = phi(vp0,vp2)
4125 inner: vp3 = phi(vp1,vp4)
4126 vp4 = vp3 + inner_step
4127 if () goto inner
4129 vp2 = vp1 + step
4130 if () goto LOOP */
4132 /* (2) Calculate the initial address of the aggregate-pointer, and set
4133 the aggregate-pointer to point to it before the loop. */
4135 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4137 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4138 offset, byte_offset);
4139 if (new_stmt_list)
4141 if (pe)
4143 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4144 gcc_assert (!new_bb);
4146 else
4147 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4150 *initial_address = new_temp;
4151 aggr_ptr_init = new_temp;
4153 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4154 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4155 inner-loop nested in LOOP (during outer-loop vectorization). */
4157 /* No update in loop is required. */
4158 if (only_init && (!loop_vinfo || at_loop == loop))
4159 aptr = aggr_ptr_init;
4160 else
4162 /* The step of the aggregate pointer is the type size. */
4163 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4164 /* One exception to the above is when the scalar step of the load in
4165 LOOP is zero. In this case the step here is also zero. */
4166 if (*inv_p)
4167 iv_step = size_zero_node;
4168 else if (tree_int_cst_sgn (step) == -1)
4169 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4171 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4173 create_iv (aggr_ptr_init,
4174 fold_convert (aggr_ptr_type, iv_step),
4175 aggr_ptr, loop, &incr_gsi, insert_after,
4176 &indx_before_incr, &indx_after_incr);
4177 incr = gsi_stmt (incr_gsi);
4178 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4180 /* Copy the points-to information if it exists. */
4181 if (DR_PTR_INFO (dr))
4183 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4184 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4186 if (ptr_incr)
4187 *ptr_incr = incr;
4189 aptr = indx_before_incr;
4192 if (!nested_in_vect_loop || only_init)
4193 return aptr;
4196 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4197 nested in LOOP, if exists. */
4199 gcc_assert (nested_in_vect_loop);
4200 if (!only_init)
4202 standard_iv_increment_position (containing_loop, &incr_gsi,
4203 &insert_after);
4204 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4205 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4206 &indx_after_incr);
4207 incr = gsi_stmt (incr_gsi);
4208 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4210 /* Copy the points-to information if it exists. */
4211 if (DR_PTR_INFO (dr))
4213 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4214 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4216 if (ptr_incr)
4217 *ptr_incr = incr;
4219 return indx_before_incr;
4221 else
4222 gcc_unreachable ();
4226 /* Function bump_vector_ptr
4228 Increment a pointer (to a vector type) by vector-size. If requested,
4229 i.e. if PTR-INCR is given, then also connect the new increment stmt
4230 to the existing def-use update-chain of the pointer, by modifying
4231 the PTR_INCR as illustrated below:
4233 The pointer def-use update-chain before this function:
4234 DATAREF_PTR = phi (p_0, p_2)
4235 ....
4236 PTR_INCR: p_2 = DATAREF_PTR + step
4238 The pointer def-use update-chain after this function:
4239 DATAREF_PTR = phi (p_0, p_2)
4240 ....
4241 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4242 ....
4243 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4245 Input:
4246 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4247 in the loop.
4248 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4249 the loop. The increment amount across iterations is expected
4250 to be vector_size.
4251 BSI - location where the new update stmt is to be placed.
4252 STMT - the original scalar memory-access stmt that is being vectorized.
4253 BUMP - optional. The offset by which to bump the pointer. If not given,
4254 the offset is assumed to be vector_size.
4256 Output: Return NEW_DATAREF_PTR as illustrated above.
4260 tree
4261 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4262 gimple *stmt, tree bump)
4264 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4265 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4266 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4267 tree update = TYPE_SIZE_UNIT (vectype);
4268 gassign *incr_stmt;
4269 ssa_op_iter iter;
4270 use_operand_p use_p;
4271 tree new_dataref_ptr;
4273 if (bump)
4274 update = bump;
4276 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4277 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4278 else
4279 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4280 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4281 dataref_ptr, update);
4282 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4284 /* Copy the points-to information if it exists. */
4285 if (DR_PTR_INFO (dr))
4287 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4288 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4291 if (!ptr_incr)
4292 return new_dataref_ptr;
4294 /* Update the vector-pointer's cross-iteration increment. */
4295 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4297 tree use = USE_FROM_PTR (use_p);
4299 if (use == dataref_ptr)
4300 SET_USE (use_p, new_dataref_ptr);
4301 else
4302 gcc_assert (tree_int_cst_compare (use, update) == 0);
4305 return new_dataref_ptr;
4309 /* Function vect_create_destination_var.
4311 Create a new temporary of type VECTYPE. */
4313 tree
4314 vect_create_destination_var (tree scalar_dest, tree vectype)
4316 tree vec_dest;
4317 const char *name;
4318 char *new_name;
4319 tree type;
4320 enum vect_var_kind kind;
4322 kind = vectype
4323 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4324 ? vect_mask_var
4325 : vect_simple_var
4326 : vect_scalar_var;
4327 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4329 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4331 name = get_name (scalar_dest);
4332 if (name)
4333 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4334 else
4335 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4336 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4337 free (new_name);
4339 return vec_dest;
4342 /* Function vect_grouped_store_supported.
4344 Returns TRUE if interleave high and interleave low permutations
4345 are supported, and FALSE otherwise. */
4347 bool
4348 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4350 machine_mode mode = TYPE_MODE (vectype);
4352 /* vect_permute_store_chain requires the group size to be equal to 3 or
4353 be a power of two. */
4354 if (count != 3 && exact_log2 (count) == -1)
4356 if (dump_enabled_p ())
4357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4358 "the size of the group of accesses"
4359 " is not a power of 2 or not eqaul to 3\n");
4360 return false;
4363 /* Check that the permutation is supported. */
4364 if (VECTOR_MODE_P (mode))
4366 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4367 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4369 if (count == 3)
4371 unsigned int j0 = 0, j1 = 0, j2 = 0;
4372 unsigned int i, j;
4374 for (j = 0; j < 3; j++)
4376 int nelt0 = ((3 - j) * nelt) % 3;
4377 int nelt1 = ((3 - j) * nelt + 1) % 3;
4378 int nelt2 = ((3 - j) * nelt + 2) % 3;
4379 for (i = 0; i < nelt; i++)
4381 if (3 * i + nelt0 < nelt)
4382 sel[3 * i + nelt0] = j0++;
4383 if (3 * i + nelt1 < nelt)
4384 sel[3 * i + nelt1] = nelt + j1++;
4385 if (3 * i + nelt2 < nelt)
4386 sel[3 * i + nelt2] = 0;
4388 if (!can_vec_perm_p (mode, false, sel))
4390 if (dump_enabled_p ())
4391 dump_printf (MSG_MISSED_OPTIMIZATION,
4392 "permutaion op not supported by target.\n");
4393 return false;
4396 for (i = 0; i < nelt; i++)
4398 if (3 * i + nelt0 < nelt)
4399 sel[3 * i + nelt0] = 3 * i + nelt0;
4400 if (3 * i + nelt1 < nelt)
4401 sel[3 * i + nelt1] = 3 * i + nelt1;
4402 if (3 * i + nelt2 < nelt)
4403 sel[3 * i + nelt2] = nelt + j2++;
4405 if (!can_vec_perm_p (mode, false, sel))
4407 if (dump_enabled_p ())
4408 dump_printf (MSG_MISSED_OPTIMIZATION,
4409 "permutaion op not supported by target.\n");
4410 return false;
4413 return true;
4415 else
4417 /* If length is not equal to 3 then only power of 2 is supported. */
4418 gcc_assert (pow2p_hwi (count));
4420 for (i = 0; i < nelt / 2; i++)
4422 sel[i * 2] = i;
4423 sel[i * 2 + 1] = i + nelt;
4425 if (can_vec_perm_p (mode, false, sel))
4427 for (i = 0; i < nelt; i++)
4428 sel[i] += nelt / 2;
4429 if (can_vec_perm_p (mode, false, sel))
4430 return true;
4435 if (dump_enabled_p ())
4436 dump_printf (MSG_MISSED_OPTIMIZATION,
4437 "permutaion op not supported by target.\n");
4438 return false;
4442 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4443 type VECTYPE. */
4445 bool
4446 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4448 return vect_lanes_optab_supported_p ("vec_store_lanes",
4449 vec_store_lanes_optab,
4450 vectype, count);
4454 /* Function vect_permute_store_chain.
4456 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4457 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4458 the data correctly for the stores. Return the final references for stores
4459 in RESULT_CHAIN.
4461 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4462 The input is 4 vectors each containing 8 elements. We assign a number to
4463 each element, the input sequence is:
4465 1st vec: 0 1 2 3 4 5 6 7
4466 2nd vec: 8 9 10 11 12 13 14 15
4467 3rd vec: 16 17 18 19 20 21 22 23
4468 4th vec: 24 25 26 27 28 29 30 31
4470 The output sequence should be:
4472 1st vec: 0 8 16 24 1 9 17 25
4473 2nd vec: 2 10 18 26 3 11 19 27
4474 3rd vec: 4 12 20 28 5 13 21 30
4475 4th vec: 6 14 22 30 7 15 23 31
4477 i.e., we interleave the contents of the four vectors in their order.
4479 We use interleave_high/low instructions to create such output. The input of
4480 each interleave_high/low operation is two vectors:
4481 1st vec 2nd vec
4482 0 1 2 3 4 5 6 7
4483 the even elements of the result vector are obtained left-to-right from the
4484 high/low elements of the first vector. The odd elements of the result are
4485 obtained left-to-right from the high/low elements of the second vector.
4486 The output of interleave_high will be: 0 4 1 5
4487 and of interleave_low: 2 6 3 7
4490 The permutation is done in log LENGTH stages. In each stage interleave_high
4491 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4492 where the first argument is taken from the first half of DR_CHAIN and the
4493 second argument from it's second half.
4494 In our example,
4496 I1: interleave_high (1st vec, 3rd vec)
4497 I2: interleave_low (1st vec, 3rd vec)
4498 I3: interleave_high (2nd vec, 4th vec)
4499 I4: interleave_low (2nd vec, 4th vec)
4501 The output for the first stage is:
4503 I1: 0 16 1 17 2 18 3 19
4504 I2: 4 20 5 21 6 22 7 23
4505 I3: 8 24 9 25 10 26 11 27
4506 I4: 12 28 13 29 14 30 15 31
4508 The output of the second stage, i.e. the final result is:
4510 I1: 0 8 16 24 1 9 17 25
4511 I2: 2 10 18 26 3 11 19 27
4512 I3: 4 12 20 28 5 13 21 30
4513 I4: 6 14 22 30 7 15 23 31. */
4515 void
4516 vect_permute_store_chain (vec<tree> dr_chain,
4517 unsigned int length,
4518 gimple *stmt,
4519 gimple_stmt_iterator *gsi,
4520 vec<tree> *result_chain)
4522 tree vect1, vect2, high, low;
4523 gimple *perm_stmt;
4524 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4525 tree perm_mask_low, perm_mask_high;
4526 tree data_ref;
4527 tree perm3_mask_low, perm3_mask_high;
4528 unsigned int i, n, log_length = exact_log2 (length);
4529 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4530 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4532 result_chain->quick_grow (length);
4533 memcpy (result_chain->address (), dr_chain.address (),
4534 length * sizeof (tree));
4536 if (length == 3)
4538 unsigned int j0 = 0, j1 = 0, j2 = 0;
4540 for (j = 0; j < 3; j++)
4542 int nelt0 = ((3 - j) * nelt) % 3;
4543 int nelt1 = ((3 - j) * nelt + 1) % 3;
4544 int nelt2 = ((3 - j) * nelt + 2) % 3;
4546 for (i = 0; i < nelt; i++)
4548 if (3 * i + nelt0 < nelt)
4549 sel[3 * i + nelt0] = j0++;
4550 if (3 * i + nelt1 < nelt)
4551 sel[3 * i + nelt1] = nelt + j1++;
4552 if (3 * i + nelt2 < nelt)
4553 sel[3 * i + nelt2] = 0;
4555 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4557 for (i = 0; i < nelt; i++)
4559 if (3 * i + nelt0 < nelt)
4560 sel[3 * i + nelt0] = 3 * i + nelt0;
4561 if (3 * i + nelt1 < nelt)
4562 sel[3 * i + nelt1] = 3 * i + nelt1;
4563 if (3 * i + nelt2 < nelt)
4564 sel[3 * i + nelt2] = nelt + j2++;
4566 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4568 vect1 = dr_chain[0];
4569 vect2 = dr_chain[1];
4571 /* Create interleaving stmt:
4572 low = VEC_PERM_EXPR <vect1, vect2,
4573 {j, nelt, *, j + 1, nelt + j + 1, *,
4574 j + 2, nelt + j + 2, *, ...}> */
4575 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4576 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4577 vect2, perm3_mask_low);
4578 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4580 vect1 = data_ref;
4581 vect2 = dr_chain[2];
4582 /* Create interleaving stmt:
4583 low = VEC_PERM_EXPR <vect1, vect2,
4584 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4585 6, 7, nelt + j + 2, ...}> */
4586 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4587 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4588 vect2, perm3_mask_high);
4589 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4590 (*result_chain)[j] = data_ref;
4593 else
4595 /* If length is not equal to 3 then only power of 2 is supported. */
4596 gcc_assert (pow2p_hwi (length));
4598 for (i = 0, n = nelt / 2; i < n; i++)
4600 sel[i * 2] = i;
4601 sel[i * 2 + 1] = i + nelt;
4603 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4605 for (i = 0; i < nelt; i++)
4606 sel[i] += nelt / 2;
4607 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4609 for (i = 0, n = log_length; i < n; i++)
4611 for (j = 0; j < length/2; j++)
4613 vect1 = dr_chain[j];
4614 vect2 = dr_chain[j+length/2];
4616 /* Create interleaving stmt:
4617 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4618 ...}> */
4619 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4620 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4621 vect2, perm_mask_high);
4622 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4623 (*result_chain)[2*j] = high;
4625 /* Create interleaving stmt:
4626 low = VEC_PERM_EXPR <vect1, vect2,
4627 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4628 ...}> */
4629 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4630 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4631 vect2, perm_mask_low);
4632 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4633 (*result_chain)[2*j+1] = low;
4635 memcpy (dr_chain.address (), result_chain->address (),
4636 length * sizeof (tree));
4641 /* Function vect_setup_realignment
4643 This function is called when vectorizing an unaligned load using
4644 the dr_explicit_realign[_optimized] scheme.
4645 This function generates the following code at the loop prolog:
4647 p = initial_addr;
4648 x msq_init = *(floor(p)); # prolog load
4649 realignment_token = call target_builtin;
4650 loop:
4651 x msq = phi (msq_init, ---)
4653 The stmts marked with x are generated only for the case of
4654 dr_explicit_realign_optimized.
4656 The code above sets up a new (vector) pointer, pointing to the first
4657 location accessed by STMT, and a "floor-aligned" load using that pointer.
4658 It also generates code to compute the "realignment-token" (if the relevant
4659 target hook was defined), and creates a phi-node at the loop-header bb
4660 whose arguments are the result of the prolog-load (created by this
4661 function) and the result of a load that takes place in the loop (to be
4662 created by the caller to this function).
4664 For the case of dr_explicit_realign_optimized:
4665 The caller to this function uses the phi-result (msq) to create the
4666 realignment code inside the loop, and sets up the missing phi argument,
4667 as follows:
4668 loop:
4669 msq = phi (msq_init, lsq)
4670 lsq = *(floor(p')); # load in loop
4671 result = realign_load (msq, lsq, realignment_token);
4673 For the case of dr_explicit_realign:
4674 loop:
4675 msq = *(floor(p)); # load in loop
4676 p' = p + (VS-1);
4677 lsq = *(floor(p')); # load in loop
4678 result = realign_load (msq, lsq, realignment_token);
4680 Input:
4681 STMT - (scalar) load stmt to be vectorized. This load accesses
4682 a memory location that may be unaligned.
4683 BSI - place where new code is to be inserted.
4684 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4685 is used.
4687 Output:
4688 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4689 target hook, if defined.
4690 Return value - the result of the loop-header phi node. */
4692 tree
4693 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4694 tree *realignment_token,
4695 enum dr_alignment_support alignment_support_scheme,
4696 tree init_addr,
4697 struct loop **at_loop)
4699 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4700 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4701 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4702 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4703 struct loop *loop = NULL;
4704 edge pe = NULL;
4705 tree scalar_dest = gimple_assign_lhs (stmt);
4706 tree vec_dest;
4707 gimple *inc;
4708 tree ptr;
4709 tree data_ref;
4710 basic_block new_bb;
4711 tree msq_init = NULL_TREE;
4712 tree new_temp;
4713 gphi *phi_stmt;
4714 tree msq = NULL_TREE;
4715 gimple_seq stmts = NULL;
4716 bool inv_p;
4717 bool compute_in_loop = false;
4718 bool nested_in_vect_loop = false;
4719 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4720 struct loop *loop_for_initial_load = NULL;
4722 if (loop_vinfo)
4724 loop = LOOP_VINFO_LOOP (loop_vinfo);
4725 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4728 gcc_assert (alignment_support_scheme == dr_explicit_realign
4729 || alignment_support_scheme == dr_explicit_realign_optimized);
4731 /* We need to generate three things:
4732 1. the misalignment computation
4733 2. the extra vector load (for the optimized realignment scheme).
4734 3. the phi node for the two vectors from which the realignment is
4735 done (for the optimized realignment scheme). */
4737 /* 1. Determine where to generate the misalignment computation.
4739 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4740 calculation will be generated by this function, outside the loop (in the
4741 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4742 caller, inside the loop.
4744 Background: If the misalignment remains fixed throughout the iterations of
4745 the loop, then both realignment schemes are applicable, and also the
4746 misalignment computation can be done outside LOOP. This is because we are
4747 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4748 are a multiple of VS (the Vector Size), and therefore the misalignment in
4749 different vectorized LOOP iterations is always the same.
4750 The problem arises only if the memory access is in an inner-loop nested
4751 inside LOOP, which is now being vectorized using outer-loop vectorization.
4752 This is the only case when the misalignment of the memory access may not
4753 remain fixed throughout the iterations of the inner-loop (as explained in
4754 detail in vect_supportable_dr_alignment). In this case, not only is the
4755 optimized realignment scheme not applicable, but also the misalignment
4756 computation (and generation of the realignment token that is passed to
4757 REALIGN_LOAD) have to be done inside the loop.
4759 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4760 or not, which in turn determines if the misalignment is computed inside
4761 the inner-loop, or outside LOOP. */
4763 if (init_addr != NULL_TREE || !loop_vinfo)
4765 compute_in_loop = true;
4766 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4770 /* 2. Determine where to generate the extra vector load.
4772 For the optimized realignment scheme, instead of generating two vector
4773 loads in each iteration, we generate a single extra vector load in the
4774 preheader of the loop, and in each iteration reuse the result of the
4775 vector load from the previous iteration. In case the memory access is in
4776 an inner-loop nested inside LOOP, which is now being vectorized using
4777 outer-loop vectorization, we need to determine whether this initial vector
4778 load should be generated at the preheader of the inner-loop, or can be
4779 generated at the preheader of LOOP. If the memory access has no evolution
4780 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4781 to be generated inside LOOP (in the preheader of the inner-loop). */
4783 if (nested_in_vect_loop)
4785 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4786 bool invariant_in_outerloop =
4787 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4788 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4790 else
4791 loop_for_initial_load = loop;
4792 if (at_loop)
4793 *at_loop = loop_for_initial_load;
4795 if (loop_for_initial_load)
4796 pe = loop_preheader_edge (loop_for_initial_load);
4798 /* 3. For the case of the optimized realignment, create the first vector
4799 load at the loop preheader. */
4801 if (alignment_support_scheme == dr_explicit_realign_optimized)
4803 /* Create msq_init = *(floor(p1)) in the loop preheader */
4804 gassign *new_stmt;
4806 gcc_assert (!compute_in_loop);
4807 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4808 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4809 NULL_TREE, &init_addr, NULL, &inc,
4810 true, &inv_p);
4811 if (TREE_CODE (ptr) == SSA_NAME)
4812 new_temp = copy_ssa_name (ptr);
4813 else
4814 new_temp = make_ssa_name (TREE_TYPE (ptr));
4815 new_stmt = gimple_build_assign
4816 (new_temp, BIT_AND_EXPR, ptr,
4817 build_int_cst (TREE_TYPE (ptr),
4818 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4819 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4820 gcc_assert (!new_bb);
4821 data_ref
4822 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4823 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4824 new_stmt = gimple_build_assign (vec_dest, data_ref);
4825 new_temp = make_ssa_name (vec_dest, new_stmt);
4826 gimple_assign_set_lhs (new_stmt, new_temp);
4827 if (pe)
4829 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4830 gcc_assert (!new_bb);
4832 else
4833 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4835 msq_init = gimple_assign_lhs (new_stmt);
4838 /* 4. Create realignment token using a target builtin, if available.
4839 It is done either inside the containing loop, or before LOOP (as
4840 determined above). */
4842 if (targetm.vectorize.builtin_mask_for_load)
4844 gcall *new_stmt;
4845 tree builtin_decl;
4847 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4848 if (!init_addr)
4850 /* Generate the INIT_ADDR computation outside LOOP. */
4851 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4852 NULL_TREE);
4853 if (loop)
4855 pe = loop_preheader_edge (loop);
4856 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4857 gcc_assert (!new_bb);
4859 else
4860 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4863 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4864 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4865 vec_dest =
4866 vect_create_destination_var (scalar_dest,
4867 gimple_call_return_type (new_stmt));
4868 new_temp = make_ssa_name (vec_dest, new_stmt);
4869 gimple_call_set_lhs (new_stmt, new_temp);
4871 if (compute_in_loop)
4872 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4873 else
4875 /* Generate the misalignment computation outside LOOP. */
4876 pe = loop_preheader_edge (loop);
4877 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4878 gcc_assert (!new_bb);
4881 *realignment_token = gimple_call_lhs (new_stmt);
4883 /* The result of the CALL_EXPR to this builtin is determined from
4884 the value of the parameter and no global variables are touched
4885 which makes the builtin a "const" function. Requiring the
4886 builtin to have the "const" attribute makes it unnecessary
4887 to call mark_call_clobbered. */
4888 gcc_assert (TREE_READONLY (builtin_decl));
4891 if (alignment_support_scheme == dr_explicit_realign)
4892 return msq;
4894 gcc_assert (!compute_in_loop);
4895 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4898 /* 5. Create msq = phi <msq_init, lsq> in loop */
4900 pe = loop_preheader_edge (containing_loop);
4901 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4902 msq = make_ssa_name (vec_dest);
4903 phi_stmt = create_phi_node (msq, containing_loop->header);
4904 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4906 return msq;
4910 /* Function vect_grouped_load_supported.
4912 COUNT is the size of the load group (the number of statements plus the
4913 number of gaps). SINGLE_ELEMENT_P is true if there is actually
4914 only one statement, with a gap of COUNT - 1.
4916 Returns true if a suitable permute exists. */
4918 bool
4919 vect_grouped_load_supported (tree vectype, bool single_element_p,
4920 unsigned HOST_WIDE_INT count)
4922 machine_mode mode = TYPE_MODE (vectype);
4924 /* If this is single-element interleaving with an element distance
4925 that leaves unused vector loads around punt - we at least create
4926 very sub-optimal code in that case (and blow up memory,
4927 see PR65518). */
4928 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
4930 if (dump_enabled_p ())
4931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4932 "single-element interleaving not supported "
4933 "for not adjacent vector loads\n");
4934 return false;
4937 /* vect_permute_load_chain requires the group size to be equal to 3 or
4938 be a power of two. */
4939 if (count != 3 && exact_log2 (count) == -1)
4941 if (dump_enabled_p ())
4942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4943 "the size of the group of accesses"
4944 " is not a power of 2 or not equal to 3\n");
4945 return false;
4948 /* Check that the permutation is supported. */
4949 if (VECTOR_MODE_P (mode))
4951 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4952 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4954 if (count == 3)
4956 unsigned int k;
4957 for (k = 0; k < 3; k++)
4959 for (i = 0; i < nelt; i++)
4960 if (3 * i + k < 2 * nelt)
4961 sel[i] = 3 * i + k;
4962 else
4963 sel[i] = 0;
4964 if (!can_vec_perm_p (mode, false, sel))
4966 if (dump_enabled_p ())
4967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4968 "shuffle of 3 loads is not supported by"
4969 " target\n");
4970 return false;
4972 for (i = 0, j = 0; i < nelt; i++)
4973 if (3 * i + k < 2 * nelt)
4974 sel[i] = i;
4975 else
4976 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
4977 if (!can_vec_perm_p (mode, false, sel))
4979 if (dump_enabled_p ())
4980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4981 "shuffle of 3 loads is not supported by"
4982 " target\n");
4983 return false;
4986 return true;
4988 else
4990 /* If length is not equal to 3 then only power of 2 is supported. */
4991 gcc_assert (pow2p_hwi (count));
4992 for (i = 0; i < nelt; i++)
4993 sel[i] = i * 2;
4994 if (can_vec_perm_p (mode, false, sel))
4996 for (i = 0; i < nelt; i++)
4997 sel[i] = i * 2 + 1;
4998 if (can_vec_perm_p (mode, false, sel))
4999 return true;
5004 if (dump_enabled_p ())
5005 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5006 "extract even/odd not supported by target\n");
5007 return false;
5010 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5011 type VECTYPE. */
5013 bool
5014 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5016 return vect_lanes_optab_supported_p ("vec_load_lanes",
5017 vec_load_lanes_optab,
5018 vectype, count);
5021 /* Function vect_permute_load_chain.
5023 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5024 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5025 the input data correctly. Return the final references for loads in
5026 RESULT_CHAIN.
5028 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5029 The input is 4 vectors each containing 8 elements. We assign a number to each
5030 element, the input sequence is:
5032 1st vec: 0 1 2 3 4 5 6 7
5033 2nd vec: 8 9 10 11 12 13 14 15
5034 3rd vec: 16 17 18 19 20 21 22 23
5035 4th vec: 24 25 26 27 28 29 30 31
5037 The output sequence should be:
5039 1st vec: 0 4 8 12 16 20 24 28
5040 2nd vec: 1 5 9 13 17 21 25 29
5041 3rd vec: 2 6 10 14 18 22 26 30
5042 4th vec: 3 7 11 15 19 23 27 31
5044 i.e., the first output vector should contain the first elements of each
5045 interleaving group, etc.
5047 We use extract_even/odd instructions to create such output. The input of
5048 each extract_even/odd operation is two vectors
5049 1st vec 2nd vec
5050 0 1 2 3 4 5 6 7
5052 and the output is the vector of extracted even/odd elements. The output of
5053 extract_even will be: 0 2 4 6
5054 and of extract_odd: 1 3 5 7
5057 The permutation is done in log LENGTH stages. In each stage extract_even
5058 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5059 their order. In our example,
5061 E1: extract_even (1st vec, 2nd vec)
5062 E2: extract_odd (1st vec, 2nd vec)
5063 E3: extract_even (3rd vec, 4th vec)
5064 E4: extract_odd (3rd vec, 4th vec)
5066 The output for the first stage will be:
5068 E1: 0 2 4 6 8 10 12 14
5069 E2: 1 3 5 7 9 11 13 15
5070 E3: 16 18 20 22 24 26 28 30
5071 E4: 17 19 21 23 25 27 29 31
5073 In order to proceed and create the correct sequence for the next stage (or
5074 for the correct output, if the second stage is the last one, as in our
5075 example), we first put the output of extract_even operation and then the
5076 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5077 The input for the second stage is:
5079 1st vec (E1): 0 2 4 6 8 10 12 14
5080 2nd vec (E3): 16 18 20 22 24 26 28 30
5081 3rd vec (E2): 1 3 5 7 9 11 13 15
5082 4th vec (E4): 17 19 21 23 25 27 29 31
5084 The output of the second stage:
5086 E1: 0 4 8 12 16 20 24 28
5087 E2: 2 6 10 14 18 22 26 30
5088 E3: 1 5 9 13 17 21 25 29
5089 E4: 3 7 11 15 19 23 27 31
5091 And RESULT_CHAIN after reordering:
5093 1st vec (E1): 0 4 8 12 16 20 24 28
5094 2nd vec (E3): 1 5 9 13 17 21 25 29
5095 3rd vec (E2): 2 6 10 14 18 22 26 30
5096 4th vec (E4): 3 7 11 15 19 23 27 31. */
5098 static void
5099 vect_permute_load_chain (vec<tree> dr_chain,
5100 unsigned int length,
5101 gimple *stmt,
5102 gimple_stmt_iterator *gsi,
5103 vec<tree> *result_chain)
5105 tree data_ref, first_vect, second_vect;
5106 tree perm_mask_even, perm_mask_odd;
5107 tree perm3_mask_low, perm3_mask_high;
5108 gimple *perm_stmt;
5109 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5110 unsigned int i, j, log_length = exact_log2 (length);
5111 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5112 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5114 result_chain->quick_grow (length);
5115 memcpy (result_chain->address (), dr_chain.address (),
5116 length * sizeof (tree));
5118 if (length == 3)
5120 unsigned int k;
5122 for (k = 0; k < 3; k++)
5124 for (i = 0; i < nelt; i++)
5125 if (3 * i + k < 2 * nelt)
5126 sel[i] = 3 * i + k;
5127 else
5128 sel[i] = 0;
5129 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5131 for (i = 0, j = 0; i < nelt; i++)
5132 if (3 * i + k < 2 * nelt)
5133 sel[i] = i;
5134 else
5135 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5137 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5139 first_vect = dr_chain[0];
5140 second_vect = dr_chain[1];
5142 /* Create interleaving stmt (low part of):
5143 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5144 ...}> */
5145 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5146 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5147 second_vect, perm3_mask_low);
5148 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5150 /* Create interleaving stmt (high part of):
5151 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5152 ...}> */
5153 first_vect = data_ref;
5154 second_vect = dr_chain[2];
5155 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5156 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5157 second_vect, perm3_mask_high);
5158 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5159 (*result_chain)[k] = data_ref;
5162 else
5164 /* If length is not equal to 3 then only power of 2 is supported. */
5165 gcc_assert (pow2p_hwi (length));
5167 for (i = 0; i < nelt; ++i)
5168 sel[i] = i * 2;
5169 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5171 for (i = 0; i < nelt; ++i)
5172 sel[i] = i * 2 + 1;
5173 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5175 for (i = 0; i < log_length; i++)
5177 for (j = 0; j < length; j += 2)
5179 first_vect = dr_chain[j];
5180 second_vect = dr_chain[j+1];
5182 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5183 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5184 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5185 first_vect, second_vect,
5186 perm_mask_even);
5187 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5188 (*result_chain)[j/2] = data_ref;
5190 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5191 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5192 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5193 first_vect, second_vect,
5194 perm_mask_odd);
5195 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5196 (*result_chain)[j/2+length/2] = data_ref;
5198 memcpy (dr_chain.address (), result_chain->address (),
5199 length * sizeof (tree));
5204 /* Function vect_shift_permute_load_chain.
5206 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5207 sequence of stmts to reorder the input data accordingly.
5208 Return the final references for loads in RESULT_CHAIN.
5209 Return true if successed, false otherwise.
5211 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5212 The input is 3 vectors each containing 8 elements. We assign a
5213 number to each element, the input sequence is:
5215 1st vec: 0 1 2 3 4 5 6 7
5216 2nd vec: 8 9 10 11 12 13 14 15
5217 3rd vec: 16 17 18 19 20 21 22 23
5219 The output sequence should be:
5221 1st vec: 0 3 6 9 12 15 18 21
5222 2nd vec: 1 4 7 10 13 16 19 22
5223 3rd vec: 2 5 8 11 14 17 20 23
5225 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5227 First we shuffle all 3 vectors to get correct elements order:
5229 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5230 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5231 3rd vec: (16 19 22) (17 20 23) (18 21)
5233 Next we unite and shift vector 3 times:
5235 1st step:
5236 shift right by 6 the concatenation of:
5237 "1st vec" and "2nd vec"
5238 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5239 "2nd vec" and "3rd vec"
5240 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5241 "3rd vec" and "1st vec"
5242 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5243 | New vectors |
5245 So that now new vectors are:
5247 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5248 2nd vec: (10 13) (16 19 22) (17 20 23)
5249 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5251 2nd step:
5252 shift right by 5 the concatenation of:
5253 "1st vec" and "3rd vec"
5254 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5255 "2nd vec" and "1st vec"
5256 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5257 "3rd vec" and "2nd vec"
5258 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5259 | New vectors |
5261 So that now new vectors are:
5263 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5264 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5265 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5267 3rd step:
5268 shift right by 5 the concatenation of:
5269 "1st vec" and "1st vec"
5270 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5271 shift right by 3 the concatenation of:
5272 "2nd vec" and "2nd vec"
5273 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5274 | New vectors |
5276 So that now all vectors are READY:
5277 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5278 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5279 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5281 This algorithm is faster than one in vect_permute_load_chain if:
5282 1. "shift of a concatination" is faster than general permutation.
5283 This is usually so.
5284 2. The TARGET machine can't execute vector instructions in parallel.
5285 This is because each step of the algorithm depends on previous.
5286 The algorithm in vect_permute_load_chain is much more parallel.
5288 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5291 static bool
5292 vect_shift_permute_load_chain (vec<tree> dr_chain,
5293 unsigned int length,
5294 gimple *stmt,
5295 gimple_stmt_iterator *gsi,
5296 vec<tree> *result_chain)
5298 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5299 tree perm2_mask1, perm2_mask2, perm3_mask;
5300 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5301 gimple *perm_stmt;
5303 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5304 unsigned int i;
5305 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5306 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5307 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5308 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5310 result_chain->quick_grow (length);
5311 memcpy (result_chain->address (), dr_chain.address (),
5312 length * sizeof (tree));
5314 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5316 unsigned int j, log_length = exact_log2 (length);
5317 for (i = 0; i < nelt / 2; ++i)
5318 sel[i] = i * 2;
5319 for (i = 0; i < nelt / 2; ++i)
5320 sel[nelt / 2 + i] = i * 2 + 1;
5321 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5323 if (dump_enabled_p ())
5324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5325 "shuffle of 2 fields structure is not \
5326 supported by target\n");
5327 return false;
5329 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5331 for (i = 0; i < nelt / 2; ++i)
5332 sel[i] = i * 2 + 1;
5333 for (i = 0; i < nelt / 2; ++i)
5334 sel[nelt / 2 + i] = i * 2;
5335 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5337 if (dump_enabled_p ())
5338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5339 "shuffle of 2 fields structure is not \
5340 supported by target\n");
5341 return false;
5343 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5345 /* Generating permutation constant to shift all elements.
5346 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5347 for (i = 0; i < nelt; i++)
5348 sel[i] = nelt / 2 + i;
5349 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5351 if (dump_enabled_p ())
5352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5353 "shift permutation is not supported by target\n");
5354 return false;
5356 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5358 /* Generating permutation constant to select vector from 2.
5359 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5360 for (i = 0; i < nelt / 2; i++)
5361 sel[i] = i;
5362 for (i = nelt / 2; i < nelt; i++)
5363 sel[i] = nelt + i;
5364 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5366 if (dump_enabled_p ())
5367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5368 "select is not supported by target\n");
5369 return false;
5371 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5373 for (i = 0; i < log_length; i++)
5375 for (j = 0; j < length; j += 2)
5377 first_vect = dr_chain[j];
5378 second_vect = dr_chain[j + 1];
5380 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5381 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5382 first_vect, first_vect,
5383 perm2_mask1);
5384 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5385 vect[0] = data_ref;
5387 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5388 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5389 second_vect, second_vect,
5390 perm2_mask2);
5391 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5392 vect[1] = data_ref;
5394 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5395 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5396 vect[0], vect[1], shift1_mask);
5397 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5398 (*result_chain)[j/2 + length/2] = data_ref;
5400 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5401 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5402 vect[0], vect[1], select_mask);
5403 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5404 (*result_chain)[j/2] = data_ref;
5406 memcpy (dr_chain.address (), result_chain->address (),
5407 length * sizeof (tree));
5409 return true;
5411 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5413 unsigned int k = 0, l = 0;
5415 /* Generating permutation constant to get all elements in rigth order.
5416 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5417 for (i = 0; i < nelt; i++)
5419 if (3 * k + (l % 3) >= nelt)
5421 k = 0;
5422 l += (3 - (nelt % 3));
5424 sel[i] = 3 * k + (l % 3);
5425 k++;
5427 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5429 if (dump_enabled_p ())
5430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5431 "shuffle of 3 fields structure is not \
5432 supported by target\n");
5433 return false;
5435 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5437 /* Generating permutation constant to shift all elements.
5438 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5439 for (i = 0; i < nelt; i++)
5440 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5441 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5443 if (dump_enabled_p ())
5444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5445 "shift permutation is not supported by target\n");
5446 return false;
5448 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5450 /* Generating permutation constant to shift all elements.
5451 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5452 for (i = 0; i < nelt; i++)
5453 sel[i] = 2 * (nelt / 3) + 1 + i;
5454 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5456 if (dump_enabled_p ())
5457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5458 "shift permutation is not supported by target\n");
5459 return false;
5461 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5463 /* Generating permutation constant to shift all elements.
5464 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5465 for (i = 0; i < nelt; i++)
5466 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5467 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5469 if (dump_enabled_p ())
5470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5471 "shift permutation is not supported by target\n");
5472 return false;
5474 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5476 /* Generating permutation constant to shift all elements.
5477 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5478 for (i = 0; i < nelt; i++)
5479 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5480 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5482 if (dump_enabled_p ())
5483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5484 "shift permutation is not supported by target\n");
5485 return false;
5487 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5489 for (k = 0; k < 3; k++)
5491 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5492 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5493 dr_chain[k], dr_chain[k],
5494 perm3_mask);
5495 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5496 vect[k] = data_ref;
5499 for (k = 0; k < 3; k++)
5501 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5502 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5503 vect[k % 3], vect[(k + 1) % 3],
5504 shift1_mask);
5505 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5506 vect_shift[k] = data_ref;
5509 for (k = 0; k < 3; k++)
5511 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5512 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5513 vect_shift[(4 - k) % 3],
5514 vect_shift[(3 - k) % 3],
5515 shift2_mask);
5516 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5517 vect[k] = data_ref;
5520 (*result_chain)[3 - (nelt % 3)] = vect[2];
5522 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5523 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5524 vect[0], shift3_mask);
5525 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5526 (*result_chain)[nelt % 3] = data_ref;
5528 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5529 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5530 vect[1], shift4_mask);
5531 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5532 (*result_chain)[0] = data_ref;
5533 return true;
5535 return false;
5538 /* Function vect_transform_grouped_load.
5540 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5541 to perform their permutation and ascribe the result vectorized statements to
5542 the scalar statements.
5545 void
5546 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5547 gimple_stmt_iterator *gsi)
5549 machine_mode mode;
5550 vec<tree> result_chain = vNULL;
5552 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5553 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5554 vectors, that are ready for vector computation. */
5555 result_chain.create (size);
5557 /* If reassociation width for vector type is 2 or greater target machine can
5558 execute 2 or more vector instructions in parallel. Otherwise try to
5559 get chain for loads group using vect_shift_permute_load_chain. */
5560 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5561 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5562 || pow2p_hwi (size)
5563 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5564 gsi, &result_chain))
5565 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5566 vect_record_grouped_load_vectors (stmt, result_chain);
5567 result_chain.release ();
5570 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5571 generated as part of the vectorization of STMT. Assign the statement
5572 for each vector to the associated scalar statement. */
5574 void
5575 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5577 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5578 gimple *next_stmt, *new_stmt;
5579 unsigned int i, gap_count;
5580 tree tmp_data_ref;
5582 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5583 Since we scan the chain starting from it's first node, their order
5584 corresponds the order of data-refs in RESULT_CHAIN. */
5585 next_stmt = first_stmt;
5586 gap_count = 1;
5587 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5589 if (!next_stmt)
5590 break;
5592 /* Skip the gaps. Loads created for the gaps will be removed by dead
5593 code elimination pass later. No need to check for the first stmt in
5594 the group, since it always exists.
5595 GROUP_GAP is the number of steps in elements from the previous
5596 access (if there is no gap GROUP_GAP is 1). We skip loads that
5597 correspond to the gaps. */
5598 if (next_stmt != first_stmt
5599 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5601 gap_count++;
5602 continue;
5605 while (next_stmt)
5607 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5608 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5609 copies, and we put the new vector statement in the first available
5610 RELATED_STMT. */
5611 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5612 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5613 else
5615 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5617 gimple *prev_stmt =
5618 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5619 gimple *rel_stmt =
5620 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5621 while (rel_stmt)
5623 prev_stmt = rel_stmt;
5624 rel_stmt =
5625 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5628 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5629 new_stmt;
5633 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5634 gap_count = 1;
5635 /* If NEXT_STMT accesses the same DR as the previous statement,
5636 put the same TMP_DATA_REF as its vectorized statement; otherwise
5637 get the next data-ref from RESULT_CHAIN. */
5638 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5639 break;
5644 /* Function vect_force_dr_alignment_p.
5646 Returns whether the alignment of a DECL can be forced to be aligned
5647 on ALIGNMENT bit boundary. */
5649 bool
5650 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5652 if (!VAR_P (decl))
5653 return false;
5655 if (decl_in_symtab_p (decl)
5656 && !symtab_node::get (decl)->can_increase_alignment_p ())
5657 return false;
5659 if (TREE_STATIC (decl))
5660 return (alignment <= MAX_OFILE_ALIGNMENT);
5661 else
5662 return (alignment <= MAX_STACK_ALIGNMENT);
5666 /* Return whether the data reference DR is supported with respect to its
5667 alignment.
5668 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5669 it is aligned, i.e., check if it is possible to vectorize it with different
5670 alignment. */
5672 enum dr_alignment_support
5673 vect_supportable_dr_alignment (struct data_reference *dr,
5674 bool check_aligned_accesses)
5676 gimple *stmt = DR_STMT (dr);
5677 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5678 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5679 machine_mode mode = TYPE_MODE (vectype);
5680 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5681 struct loop *vect_loop = NULL;
5682 bool nested_in_vect_loop = false;
5684 if (aligned_access_p (dr) && !check_aligned_accesses)
5685 return dr_aligned;
5687 /* For now assume all conditional loads/stores support unaligned
5688 access without any special code. */
5689 if (is_gimple_call (stmt)
5690 && gimple_call_internal_p (stmt)
5691 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5692 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5693 return dr_unaligned_supported;
5695 if (loop_vinfo)
5697 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5698 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5701 /* Possibly unaligned access. */
5703 /* We can choose between using the implicit realignment scheme (generating
5704 a misaligned_move stmt) and the explicit realignment scheme (generating
5705 aligned loads with a REALIGN_LOAD). There are two variants to the
5706 explicit realignment scheme: optimized, and unoptimized.
5707 We can optimize the realignment only if the step between consecutive
5708 vector loads is equal to the vector size. Since the vector memory
5709 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5710 is guaranteed that the misalignment amount remains the same throughout the
5711 execution of the vectorized loop. Therefore, we can create the
5712 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5713 at the loop preheader.
5715 However, in the case of outer-loop vectorization, when vectorizing a
5716 memory access in the inner-loop nested within the LOOP that is now being
5717 vectorized, while it is guaranteed that the misalignment of the
5718 vectorized memory access will remain the same in different outer-loop
5719 iterations, it is *not* guaranteed that is will remain the same throughout
5720 the execution of the inner-loop. This is because the inner-loop advances
5721 with the original scalar step (and not in steps of VS). If the inner-loop
5722 step happens to be a multiple of VS, then the misalignment remains fixed
5723 and we can use the optimized realignment scheme. For example:
5725 for (i=0; i<N; i++)
5726 for (j=0; j<M; j++)
5727 s += a[i+j];
5729 When vectorizing the i-loop in the above example, the step between
5730 consecutive vector loads is 1, and so the misalignment does not remain
5731 fixed across the execution of the inner-loop, and the realignment cannot
5732 be optimized (as illustrated in the following pseudo vectorized loop):
5734 for (i=0; i<N; i+=4)
5735 for (j=0; j<M; j++){
5736 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5737 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5738 // (assuming that we start from an aligned address).
5741 We therefore have to use the unoptimized realignment scheme:
5743 for (i=0; i<N; i+=4)
5744 for (j=k; j<M; j+=4)
5745 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5746 // that the misalignment of the initial address is
5747 // 0).
5749 The loop can then be vectorized as follows:
5751 for (k=0; k<4; k++){
5752 rt = get_realignment_token (&vp[k]);
5753 for (i=0; i<N; i+=4){
5754 v1 = vp[i+k];
5755 for (j=k; j<M; j+=4){
5756 v2 = vp[i+j+VS-1];
5757 va = REALIGN_LOAD <v1,v2,rt>;
5758 vs += va;
5759 v1 = v2;
5762 } */
5764 if (DR_IS_READ (dr))
5766 bool is_packed = false;
5767 tree type = (TREE_TYPE (DR_REF (dr)));
5769 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5770 && (!targetm.vectorize.builtin_mask_for_load
5771 || targetm.vectorize.builtin_mask_for_load ()))
5773 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5775 /* If we are doing SLP then the accesses need not have the
5776 same alignment, instead it depends on the SLP group size. */
5777 if (loop_vinfo
5778 && STMT_SLP_TYPE (stmt_info)
5779 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5780 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5781 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5783 else if (!loop_vinfo
5784 || (nested_in_vect_loop
5785 && (TREE_INT_CST_LOW (DR_STEP (dr))
5786 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5787 return dr_explicit_realign;
5788 else
5789 return dr_explicit_realign_optimized;
5791 if (!known_alignment_for_access_p (dr))
5792 is_packed = not_size_aligned (DR_REF (dr));
5794 if (targetm.vectorize.support_vector_misalignment
5795 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5796 /* Can't software pipeline the loads, but can at least do them. */
5797 return dr_unaligned_supported;
5799 else
5801 bool is_packed = false;
5802 tree type = (TREE_TYPE (DR_REF (dr)));
5804 if (!known_alignment_for_access_p (dr))
5805 is_packed = not_size_aligned (DR_REF (dr));
5807 if (targetm.vectorize.support_vector_misalignment
5808 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5809 return dr_unaligned_supported;
5812 /* Unsupported. */
5813 return dr_unaligned_unsupported;