[official-gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
54 /* Return true if load- or store-lanes optab OPTAB is implemented for
55 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
57 static bool
58 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
59 tree vectype, unsigned HOST_WIDE_INT count)
61 machine_mode mode, array_mode;
62 bool limit_p;
64 mode = TYPE_MODE (vectype);
65 limit_p = !targetm.array_mode_supported_p (mode, count);
66 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
67 MODE_INT, limit_p);
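/* For example (illustrative): for a 128-bit V4SImode vector and COUNT == 2
   this asks for a 256-bit integer mode (OImode on targets that provide one)
   to hold the COUNT vectors back to back; if no such mode exists the
   result is BLKmode and we give up below.  */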
69 if (array_mode == BLKmode)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
74 GET_MODE_NAME (mode), count);
75 return false;
78 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
80 if (dump_enabled_p ())
81 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
82 "cannot use %s<%s><%s>\n", name,
83 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
84 return false;
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_NOTE, vect_location,
89 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
90 GET_MODE_NAME (mode));
92 return true;
96 /* Return the smallest scalar part of STMT.
97 This is used to determine the vectype of the stmt. We generally set the
98 vectype according to the type of the result (lhs). For stmts whose
99 result-type is different than the type of the arguments (e.g., demotion,
100 promotion), vectype will be reset appropriately (later). Note that we have
101 to visit the smallest datatype in this function, because that determines the
102 VF. If the smallest datatype in the loop is present only as the rhs of a
103 promotion operation - we'd miss it.
104 Such a case, where a variable of this datatype does not appear in the lhs
105 anywhere in the loop, can only occur if it's an invariant: e.g.:
106 'int_x = (int) short_inv', which we'd expect to have been optimized away by
107 invariant motion. However, we cannot rely on invariant motion to always
108 take invariants out of the loop, and so in the case of promotion we also
109 have to check the rhs.
110 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
111 types. */
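/* For example (illustrative): for a widening multiply such as
      int_res = (int) short_a * (int) short_b
   represented as a WIDEN_MULT_EXPR, the result type is int but the
   operands are shorts, so the smallest scalar type - and hence the VF -
   is determined by the 2-byte operand type.  */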
113 tree
114 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
115 HOST_WIDE_INT *rhs_size_unit)
117 tree scalar_type = gimple_expr_type (stmt);
118 HOST_WIDE_INT lhs, rhs;
120 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
122 if (is_gimple_assign (stmt)
123 && (gimple_assign_cast_p (stmt)
124 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
125 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
126 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
128 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
130 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
131 if (rhs < lhs)
132 scalar_type = rhs_type;
135 *lhs_size_unit = lhs;
136 *rhs_size_unit = rhs;
137 return scalar_type;
141 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
142 tested at run-time. Return TRUE if DDR was successfully inserted.
143 Return false if versioning is not supported. */
145 static bool
146 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
148 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
150 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
151 return false;
153 if (dump_enabled_p ())
155 dump_printf_loc (MSG_NOTE, vect_location,
156 "mark for run-time aliasing test between ");
157 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
158 dump_printf (MSG_NOTE, " and ");
159 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
160 dump_printf (MSG_NOTE, "\n");
163 if (optimize_loop_nest_for_size_p (loop))
165 if (dump_enabled_p ())
166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
167 "versioning not supported when optimizing"
168 " for size.\n");
169 return false;
172 /* FORNOW: We don't support versioning with outer-loop vectorization. */
173 if (loop->inner)
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
177 "versioning not yet supported for outer-loops.\n");
178 return false;
181 /* FORNOW: We don't support creating runtime alias tests for non-constant
182 step. */
183 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
184 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
186 if (dump_enabled_p ())
187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
188 "versioning not yet supported for non-constant "
189 "step\n");
190 return false;
193 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
194 return true;
198 /* Function vect_analyze_data_ref_dependence.
200 Return TRUE if there (might) exist a dependence between a memory-reference
201 DRA and a memory-reference DRB. If versioning for alias can check the
202 dependence at run time instead, return FALSE. Adjust *MAX_VF according to
203 the data dependence. */
205 static bool
206 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
207 loop_vec_info loop_vinfo, int *max_vf)
209 unsigned int i;
210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
211 struct data_reference *dra = DDR_A (ddr);
212 struct data_reference *drb = DDR_B (ddr);
213 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
214 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
215 lambda_vector dist_v;
216 unsigned int loop_depth;
218 /* In loop analysis all data references should be vectorizable. */
219 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
220 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
221 gcc_unreachable ();
223 /* Independent data accesses. */
224 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
225 return false;
227 if (dra == drb
228 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
229 return false;
231 /* We do not have to consider dependences between accesses that belong
232 to the same group. */
233 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
234 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
235 return false;
237 /* Even if we have an anti-dependence then, as the vectorized loop covers at
238 least two scalar iterations, there is always also a true dependence.
239 As the vectorizer does not re-order loads and stores we can ignore
240 the anti-dependence if TBAA can disambiguate both DRs similar to the
241 case with known negative distance anti-dependences (positive
242 distance anti-dependences would violate TBAA constraints). */
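/* For example (illustrative): under -fstrict-aliasing a read through an
   'int *' and a write through a 'float *' have non-conflicting alias sets,
   so the apparent anti-dependence between them can be ignored here.  */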
243 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
244 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
245 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
246 get_alias_set (DR_REF (drb))))
247 return false;
249 /* Unknown data dependence. */
250 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
252 /* If user asserted safelen consecutive iterations can be
253 executed concurrently, assume independence. */
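/* loop->safelen is set e.g. from #pragma omp simd safelen(N);
   #pragma GCC ivdep effectively makes it INT_MAX.  */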
254 if (loop->safelen >= 2)
256 if (loop->safelen < *max_vf)
257 *max_vf = loop->safelen;
258 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
259 return false;
262 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
263 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
265 if (dump_enabled_p ())
267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
268 "versioning for alias not supported for: "
269 "can't determine dependence between ");
270 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
271 DR_REF (dra));
272 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
273 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
274 DR_REF (drb));
275 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
277 return true;
280 if (dump_enabled_p ())
282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
283 "versioning for alias required: "
284 "can't determine dependence between ");
285 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
286 DR_REF (dra));
287 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
289 DR_REF (drb));
290 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
293 /* Add to list of ddrs that need to be tested at run-time. */
294 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
297 /* Known data dependence. */
298 if (DDR_NUM_DIST_VECTS (ddr) == 0)
300 /* If user asserted safelen consecutive iterations can be
301 executed concurrently, assume independence. */
302 if (loop->safelen >= 2)
304 if (loop->safelen < *max_vf)
305 *max_vf = loop->safelen;
306 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
307 return false;
310 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
311 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
313 if (dump_enabled_p ())
315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
316 "versioning for alias not supported for: "
317 "bad dist vector for ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
319 DR_REF (dra));
320 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
321 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
322 DR_REF (drb));
323 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
325 return true;
328 if (dump_enabled_p ())
330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
331 "versioning for alias required: "
332 "bad dist vector for ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
334 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
335 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
336 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
338 /* Add to list of ddrs that need to be tested at run-time. */
339 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
342 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
343 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
345 int dist = dist_v[loop_depth];
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance = %d.\n", dist);
351 if (dist == 0)
353 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "dependence distance == 0 between ");
357 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
358 dump_printf (MSG_NOTE, " and ");
359 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
360 dump_printf (MSG_NOTE, "\n");
363 /* When we perform grouped accesses and perform implicit CSE
364 by detecting equal accesses and doing disambiguation with
365 runtime alias tests like for
366 .. = a[i];
367 .. = a[i+1];
368 a[i] = ..;
369 a[i+1] = ..;
370 *p = ..;
371 .. = a[i];
372 .. = a[i+1];
373 where we will end up loading { a[i], a[i+1] } once, make
374 sure that inserting group loads before the first load and
375 stores after the last store will do the right thing.
376 Similar for groups like
377 a[i] = ...;
378 ... = a[i];
379 a[i+1] = ...;
380 where loads from the group interleave with the store. */
381 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
382 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
384 gimple *earlier_stmt;
385 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
386 if (DR_IS_WRITE
387 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
389 if (dump_enabled_p ())
390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
391 "READ_WRITE dependence in interleaving."
392 "\n");
393 return true;
397 continue;
400 if (dist > 0 && DDR_REVERSED_P (ddr))
402 /* If DDR_REVERSED_P the order of the data-refs in DDR was
403 reversed (to make distance vector positive), and the actual
404 distance is negative. */
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "dependence distance negative.\n");
408 /* Record a negative dependence distance to later limit the
409 amount of stmt copying / unrolling we can perform.
410 Only need to handle read-after-write dependence. */
411 if (DR_IS_READ (drb)
412 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
413 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
414 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
415 continue;
418 if (abs (dist) >= 2
419 && abs (dist) < *max_vf)
421 /* The dependence distance requires reduction of the maximal
422 vectorization factor. */
423 *max_vf = abs (dist);
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE, vect_location,
426 "adjusting maximal vectorization factor to %i\n",
427 *max_vf);
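/* For example (illustrative): a loop containing a[i+3] = a[i] + 1 has
   dependence distance 3, so the maximal vectorization factor is reduced
   to 3 here.  */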
430 if (abs (dist) >= *max_vf)
432 /* Dependence distance does not create dependence, as far as
433 vectorization is concerned, in this case. */
434 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE, vect_location,
436 "dependence distance >= VF.\n");
437 continue;
440 if (dump_enabled_p ())
442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
443 "not vectorized, possible dependence "
444 "between data-refs ");
445 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
446 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
447 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
448 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
451 return true;
454 return false;
457 /* Function vect_analyze_data_ref_dependences.
459 Examine all the data references in the loop, and make sure there do not
460 exist any data dependences between them. Set *MAX_VF according to
461 the maximum vectorization factor the data dependences allow. */
463 bool
464 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
466 unsigned int i;
467 struct data_dependence_relation *ddr;
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location,
471 "=== vect_analyze_data_ref_dependences ===\n");
473 LOOP_VINFO_DDRS (loop_vinfo)
474 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
475 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
476 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
477 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
478 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
479 &LOOP_VINFO_DDRS (loop_vinfo),
480 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
481 return false;
483 /* For epilogues we either have no aliases or alias versioning
484 was applied to the original loop. Therefore we may just get max_vf
485 using the VF of the original loop. */
486 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
487 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
488 else
489 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
490 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
491 return false;
493 return true;
497 /* Function vect_slp_analyze_data_ref_dependence.
499 Return TRUE if there (might) exist a dependence between a memory-reference
500 DRA and a memory-reference DRB for basic-block SLP. Unlike the loop
501 variant there is no run-time aliasing fallback and no *MAX_VF to adjust:
502 any possible dependence makes the SLP instance unvectorizable. */
504 static bool
505 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
507 struct data_reference *dra = DDR_A (ddr);
508 struct data_reference *drb = DDR_B (ddr);
510 /* We need to check dependences of statements marked as unvectorizable
511 as well, they still can prohibit vectorization. */
513 /* Independent data accesses. */
514 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
515 return false;
517 if (dra == drb)
518 return false;
520 /* Read-read is OK. */
521 if (DR_IS_READ (dra) && DR_IS_READ (drb))
522 return false;
524 /* If dra and drb are part of the same interleaving chain consider
525 them independent. */
526 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
527 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
528 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
529 return false;
531 /* Unknown data dependence. */
532 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537 "can't determine dependence between ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
541 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 else if (dump_enabled_p ())
546 dump_printf_loc (MSG_NOTE, vect_location,
547 "determined dependence between ");
548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
549 dump_printf (MSG_NOTE, " and ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
551 dump_printf (MSG_NOTE, "\n");
554 return true;
558 /* Analyze dependences involved in the transform of SLP NODE. STORES
559 contain the vector of scalar stores of this instance if we are
560 disambiguating the loads. */
562 static bool
563 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
564 vec<gimple *> stores, gimple *last_store)
566 /* This walks over all stmts involved in the SLP load/store done
567 in NODE verifying we can sink them up to the last stmt in the
568 group. */
569 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
570 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
572 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
573 if (access == last_access)
574 continue;
575 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
576 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
577 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
579 gimple *stmt = gsi_stmt (gsi);
580 if (! gimple_vuse (stmt)
581 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
582 continue;
584 /* If we couldn't record a (single) data reference for this
585 stmt we have to give up. */
586 /* ??? Here and below if dependence analysis fails we can resort
587 to the alias oracle which can handle more kinds of stmts. */
588 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
589 if (!dr_b)
590 return false;
592 bool dependent = false;
593 /* If we run into a store of this same instance (we've just
594 marked those) then delay dependence checking until we run
595 into the last store because this is where it will have
596 been sunk to (and we verify if we can do that as well). */
597 if (gimple_visited_p (stmt))
599 if (stmt != last_store)
600 continue;
601 unsigned i;
602 gimple *store;
603 FOR_EACH_VEC_ELT (stores, i, store)
605 data_reference *store_dr
606 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
607 ddr_p ddr = initialize_data_dependence_relation
608 (dr_a, store_dr, vNULL);
609 dependent = vect_slp_analyze_data_ref_dependence (ddr);
610 free_dependence_relation (ddr);
611 if (dependent)
612 break;
615 else
617 ddr_p ddr = initialize_data_dependence_relation (dr_a,
618 dr_b, vNULL);
619 dependent = vect_slp_analyze_data_ref_dependence (ddr);
620 free_dependence_relation (ddr);
622 if (dependent)
623 return false;
626 return true;
630 /* Function vect_slp_analyze_instance_dependence.
632 Examine the data references in SLP instance INSTANCE and make sure
633 there are no data dependences that prevent sinking its loads and
634 stores to the vectorized statement insert locations. */
636 bool
637 vect_slp_analyze_instance_dependence (slp_instance instance)
639 if (dump_enabled_p ())
640 dump_printf_loc (MSG_NOTE, vect_location,
641 "=== vect_slp_analyze_instance_dependence ===\n");
643 /* The stores of this instance are at the root of the SLP tree. */
644 slp_tree store = SLP_INSTANCE_TREE (instance);
645 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
646 store = NULL;
648 /* Verify we can sink stores to the vectorized stmt insert location. */
649 gimple *last_store = NULL;
650 if (store)
652 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
653 return false;
655 /* Mark stores in this instance and remember the last one. */
656 last_store = vect_find_last_scalar_stmt_in_slp (store);
657 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
658 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
661 bool res = true;
663 /* Verify we can sink loads to the vectorized stmt insert location,
664 special-casing stores of this instance. */
665 slp_tree load;
666 unsigned int i;
667 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
668 if (! vect_slp_analyze_node_dependences (instance, load,
669 store
670 ? SLP_TREE_SCALAR_STMTS (store)
671 : vNULL, last_store))
673 res = false;
674 break;
677 /* Unset the visited flag. */
678 if (store)
679 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
680 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
682 return res;
685 /* Function vect_compute_data_ref_alignment
687 Compute the misalignment of the data reference DR.
689 Output:
690 1. If during the misalignment computation it is found that the data reference
691 cannot be vectorized then false is returned.
692 2. DR_MISALIGNMENT (DR) is defined.
694 FOR NOW: No analysis is actually performed. Misalignment is calculated
695 only for trivial cases. TODO. */
697 bool
698 vect_compute_data_ref_alignment (struct data_reference *dr)
700 gimple *stmt = DR_STMT (dr);
701 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
702 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
703 struct loop *loop = NULL;
704 tree ref = DR_REF (dr);
705 tree vectype;
706 tree base, base_addr;
707 tree misalign = NULL_TREE;
708 tree aligned_to;
709 tree step;
710 unsigned HOST_WIDE_INT alignment;
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE, vect_location,
714 "vect_compute_data_ref_alignment:\n");
716 if (loop_vinfo)
717 loop = LOOP_VINFO_LOOP (loop_vinfo);
719 /* Initialize misalignment to unknown. */
720 SET_DR_MISALIGNMENT (dr, -1);
722 if (tree_fits_shwi_p (DR_STEP (dr)))
723 misalign = DR_INIT (dr);
724 aligned_to = DR_ALIGNED_TO (dr);
725 base_addr = DR_BASE_ADDRESS (dr);
726 vectype = STMT_VINFO_VECTYPE (stmt_info);
728 /* In case the dataref is in an inner-loop of the loop that is being
729 vectorized (LOOP), we use the base and misalignment information
730 relative to the outer-loop (LOOP). This is ok only if the misalignment
731 stays the same throughout the execution of the inner-loop, which is why
732 we have to check that the stride of the dataref in the inner-loop is a
733 multiple of the vector size. */
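/* For example (illustrative): with a 16-byte vector type an inner-loop
   step of 32 bytes keeps the misalignment of the access the same in every
   inner-loop iteration, while a step of 4 bytes does not.  */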
734 if (loop && nested_in_vect_loop_p (loop, stmt))
736 tree step = DR_STEP (dr);
738 if (tree_fits_shwi_p (step)
739 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_NOTE, vect_location,
743 "inner step divides the vector-size.\n");
744 misalign = STMT_VINFO_DR_INIT (stmt_info);
745 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
746 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
748 else
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
752 "inner step doesn't divide the vector-size.\n");
753 misalign = NULL_TREE;
757 /* Similarly we can only use base and misalignment information relative to
758 an innermost loop if the misalignment stays the same throughout the
759 execution of the loop. As above, this is the case if the stride of the
760 dataref times the vectorization factor is a multiple of the vector size. */
761 else
763 tree step = DR_STEP (dr);
764 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
766 if (tree_fits_shwi_p (step)
767 && ((tree_to_shwi (step) * vf)
768 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
770 if (dump_enabled_p ())
771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
772 "step doesn't divide the vector-size.\n");
773 misalign = NULL_TREE;
777 /* To look at alignment of the base we have to preserve an inner MEM_REF
778 as that carries alignment information of the actual access. */
779 base = ref;
780 while (handled_component_p (base))
781 base = TREE_OPERAND (base, 0);
782 unsigned int base_alignment = 0;
783 unsigned HOST_WIDE_INT base_bitpos;
784 get_object_alignment_1 (base, &base_alignment, &base_bitpos);
785 /* As data-ref analysis strips the MEM_REF down to its base operand
786 to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
787 adjust things to make base_alignment valid as the alignment of
788 DR_BASE_ADDRESS. */
789 if (TREE_CODE (base) == MEM_REF)
791 /* Note all this only works if DR_BASE_ADDRESS is the same as
792 MEM_REF operand zero, otherwise DR/SCEV analysis might have factored
793 in other offsets. We need to rework DR to compute the alignment
794 of DR_BASE_ADDRESS as long as all information is still available. */
795 if (operand_equal_p (TREE_OPERAND (base, 0), base_addr, 0))
797 base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
798 base_bitpos &= (base_alignment - 1);
800 else
801 base_bitpos = BITS_PER_UNIT;
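/* A nonzero leftover bit position limits the known alignment to its
   lowest set bit: e.g. base_bitpos == 24 (binary 11000) caps
   base_alignment at 8 bits.  */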
803 if (base_bitpos != 0)
804 base_alignment = base_bitpos & -base_bitpos;
805 /* Also look at the alignment of the base address DR analysis
806 computed. */
807 unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
808 if (base_addr_alignment > base_alignment)
809 base_alignment = base_addr_alignment;
811 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
812 DR_VECT_AUX (dr)->base_element_aligned = true;
814 alignment = TYPE_ALIGN_UNIT (vectype);
816 if ((compare_tree_int (aligned_to, alignment) < 0)
817 || !misalign)
819 if (dump_enabled_p ())
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "Unknown alignment for access: ");
823 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
824 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
826 return true;
829 if (base_alignment < TYPE_ALIGN (vectype))
831 base = base_addr;
832 if (TREE_CODE (base) == ADDR_EXPR)
833 base = TREE_OPERAND (base, 0);
834 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
836 if (dump_enabled_p ())
838 dump_printf_loc (MSG_NOTE, vect_location,
839 "can't force alignment of ref: ");
840 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
841 dump_printf (MSG_NOTE, "\n");
843 return true;
846 if (DECL_USER_ALIGN (base))
848 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location,
851 "not forcing alignment of user-aligned "
852 "variable: ");
853 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
854 dump_printf (MSG_NOTE, "\n");
856 return true;
859 /* Force the alignment of the decl.
860 NOTE: This is the only change to the code we make during
861 the analysis phase, before deciding to vectorize the loop. */
862 if (dump_enabled_p ())
864 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
865 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
866 dump_printf (MSG_NOTE, "\n");
869 DR_VECT_AUX (dr)->base_decl = base;
870 DR_VECT_AUX (dr)->base_misaligned = true;
871 DR_VECT_AUX (dr)->base_element_aligned = true;
874 if (loop && nested_in_vect_loop_p (loop, stmt))
875 step = STMT_VINFO_DR_STEP (stmt_info);
876 else
877 step = DR_STEP (dr);
878 /* If this is a backward running DR then first access in the larger
879 vectype actually is N-1 elements before the address in the DR.
880 Adjust misalign accordingly. */
881 if (tree_int_cst_sgn (step) < 0)
883 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
884 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
885 otherwise we wouldn't be here. */
886 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
887 /* PLUS because STEP was negative. */
888 misalign = size_binop (PLUS_EXPR, misalign, offset);
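/* For example (illustrative): a backward V4SI access has step -4, so
   OFFSET is 3 * -4 == -12 and a misalignment of 0 becomes -12, i.e. 4
   once the floor-modulo by the 16-byte vector alignment is applied
   below.  */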
891 SET_DR_MISALIGNMENT (dr,
892 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
894 if (dump_enabled_p ())
896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
897 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
898 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
899 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
902 return true;
906 /* Function vect_update_misalignment_for_peel
908 DR - the data reference whose misalignment is to be adjusted.
909 DR_PEEL - the data reference whose misalignment is being made
910 zero in the vector loop by the peel.
911 NPEEL - the number of iterations in the peel loop if the misalignment
912 of DR_PEEL is known at compile time. */
914 static void
915 vect_update_misalignment_for_peel (struct data_reference *dr,
916 struct data_reference *dr_peel, int npeel)
918 unsigned int i;
919 vec<dr_p> same_align_drs;
920 struct data_reference *current_dr;
921 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
922 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
923 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
924 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
926 /* For interleaved data accesses the step in the loop must be multiplied by
927 the size of the interleaving group. */
928 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
929 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
930 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
931 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
933 /* It can be assumed that the data refs with the same alignment as dr_peel
934 are aligned in the vector loop. */
935 same_align_drs
936 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
937 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
939 if (current_dr != dr)
940 continue;
941 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
942 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
943 SET_DR_MISALIGNMENT (dr, 0);
944 return;
947 if (known_alignment_for_access_p (dr)
948 && known_alignment_for_access_p (dr_peel))
950 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
951 int misal = DR_MISALIGNMENT (dr);
952 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
953 misal += negative ? -npeel * dr_size : npeel * dr_size;
954 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
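/* For example (illustrative): with NPEEL == 3, a forward 4-byte access
   that was misaligned by 4 bytes against a 16-byte vector alignment ends
   up with (4 + 3*4) & 15 == 0, i.e. aligned after the peel.  */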
955 SET_DR_MISALIGNMENT (dr, misal);
956 return;
959 if (dump_enabled_p ())
960 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
961 SET_DR_MISALIGNMENT (dr, -1);
965 /* Function verify_data_ref_alignment
967 Return TRUE if DR can be handled with respect to alignment. */
969 static bool
970 verify_data_ref_alignment (data_reference_p dr)
972 enum dr_alignment_support supportable_dr_alignment
973 = vect_supportable_dr_alignment (dr, false);
974 if (!supportable_dr_alignment)
976 if (dump_enabled_p ())
978 if (DR_IS_READ (dr))
979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
980 "not vectorized: unsupported unaligned load.");
981 else
982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
983 "not vectorized: unsupported unaligned "
984 "store.");
986 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
987 DR_REF (dr));
988 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
990 return false;
993 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
994 dump_printf_loc (MSG_NOTE, vect_location,
995 "Vectorizing an unaligned access.\n");
997 return true;
1000 /* Function vect_verify_datarefs_alignment
1002 Return TRUE if all data references in the loop can be
1003 handled with respect to alignment. */
1005 bool
1006 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1008 vec<data_reference_p> datarefs = vinfo->datarefs;
1009 struct data_reference *dr;
1010 unsigned int i;
1012 FOR_EACH_VEC_ELT (datarefs, i, dr)
1014 gimple *stmt = DR_STMT (dr);
1015 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1017 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1018 continue;
1020 /* For interleaving, only the alignment of the first access matters. */
1021 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1022 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1023 continue;
1025 /* Strided accesses perform only component accesses, alignment is
1026 irrelevant for them. */
1027 if (STMT_VINFO_STRIDED_P (stmt_info)
1028 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1029 continue;
1031 if (! verify_data_ref_alignment (dr))
1032 return false;
1035 return true;
1038 /* Given a memory reference EXP, return whether its alignment is less
1039 than its size. */
1041 static bool
1042 not_size_aligned (tree exp)
1044 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1045 return true;
1047 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1048 > get_object_alignment (exp));
1051 /* Function vector_alignment_reachable_p
1053 Return true if vector alignment for DR is reachable by peeling
1054 a few loop iterations. Return false otherwise. */
1056 static bool
1057 vector_alignment_reachable_p (struct data_reference *dr)
1059 gimple *stmt = DR_STMT (dr);
1060 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1061 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1063 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1065 /* For interleaved access we peel only if number of iterations in
1066 the prolog loop ({VF - misalignment}), is a multiple of the
1067 number of the interleaved accesses. */
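/* For example (illustrative): with V4SI (4 elements) and a group size
   of 2, a misalignment of 2 elements gives (4 - 2) % 2 == 0 and peeling
   is considered, whereas a misalignment of 1 element leaves a remainder
   and we give up.  */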
1068 int elem_size, mis_in_elements;
1069 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1071 /* FORNOW: handle only known alignment. */
1072 if (!known_alignment_for_access_p (dr))
1073 return false;
1075 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1076 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1078 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1079 return false;
1082 /* If misalignment is known at the compile time then allow peeling
1083 only if natural alignment is reachable through peeling. */
1084 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1086 HOST_WIDE_INT elmsize =
1087 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1088 if (dump_enabled_p ())
1090 dump_printf_loc (MSG_NOTE, vect_location,
1091 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1092 dump_printf (MSG_NOTE,
1093 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1095 if (DR_MISALIGNMENT (dr) % elmsize)
1097 if (dump_enabled_p ())
1098 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1099 "data size does not divide the misalignment.\n");
1100 return false;
1104 if (!known_alignment_for_access_p (dr))
1106 tree type = TREE_TYPE (DR_REF (dr));
1107 bool is_packed = not_size_aligned (DR_REF (dr));
1108 if (dump_enabled_p ())
1109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1110 "Unknown misalignment, %snaturally aligned\n",
1111 is_packed ? "not " : "");
1112 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1115 return true;
1119 /* Calculate the cost of the memory access represented by DR. */
1121 static void
1122 vect_get_data_access_cost (struct data_reference *dr,
1123 unsigned int *inside_cost,
1124 unsigned int *outside_cost,
1125 stmt_vector_for_cost *body_cost_vec)
1127 gimple *stmt = DR_STMT (dr);
1128 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1129 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1130 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1131 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1132 int ncopies = vf / nunits;
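/* E.g. with a vectorization factor of 8 and 4 elements per vector, each
   scalar access expands to 2 vector accesses per vectorized iteration.  */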
1134 if (DR_IS_READ (dr))
1135 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1136 NULL, body_cost_vec, false);
1137 else
1138 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1140 if (dump_enabled_p ())
1141 dump_printf_loc (MSG_NOTE, vect_location,
1142 "vect_get_data_access_cost: inside_cost = %d, "
1143 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1147 typedef struct _vect_peel_info
1149 struct data_reference *dr;
1150 int npeel;
1151 unsigned int count;
1152 } *vect_peel_info;
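/* Entries are keyed on NPEEL; COUNT tracks how many data references
   would become aligned by peeling NPEEL scalar iterations, with DR one
   representative of them (see vect_peeling_hash_insert).  */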
1154 typedef struct _vect_peel_extended_info
1156 struct _vect_peel_info peel_info;
1157 unsigned int inside_cost;
1158 unsigned int outside_cost;
1159 stmt_vector_for_cost body_cost_vec;
1160 } *vect_peel_extended_info;
1163 /* Peeling hashtable helpers. */
1165 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1167 static inline hashval_t hash (const _vect_peel_info *);
1168 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1171 inline hashval_t
1172 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1174 return (hashval_t) peel_info->npeel;
1177 inline bool
1178 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1180 return (a->npeel == b->npeel);
1184 /* Insert DR into peeling hash table with NPEEL as key. */
1186 static void
1187 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1188 loop_vec_info loop_vinfo, struct data_reference *dr,
1189 int npeel)
1191 struct _vect_peel_info elem, *slot;
1192 _vect_peel_info **new_slot;
1193 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1195 elem.npeel = npeel;
1196 slot = peeling_htab->find (&elem);
1197 if (slot)
1198 slot->count++;
1199 else
1201 slot = XNEW (struct _vect_peel_info);
1202 slot->npeel = npeel;
1203 slot->dr = dr;
1204 slot->count = 1;
1205 new_slot = peeling_htab->find_slot (slot, INSERT);
1206 *new_slot = slot;
1209 if (!supportable_dr_alignment
1210 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1211 slot->count += VECT_MAX_COST;
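/* Boosting COUNT for an access whose misalignment is not supported makes
   vect_peeling_hash_get_most_frequent prefer the peeling that aligns it
   when the unlimited cost model is in effect.  */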
1215 /* Traverse peeling hash table to find peeling option that aligns maximum
1216 number of data accesses. */
int
1219 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1220 _vect_peel_extended_info *max)
1222 vect_peel_info elem = *slot;
1224 if (elem->count > max->peel_info.count
1225 || (elem->count == max->peel_info.count
1226 && max->peel_info.npeel > elem->npeel))
1228 max->peel_info.npeel = elem->npeel;
1229 max->peel_info.count = elem->count;
1230 max->peel_info.dr = elem->dr;
1233 return 1;
1237 /* Traverse peeling hash table and calculate cost for each peeling option.
1238 Find the one with the lowest cost. */
int
1241 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1242 _vect_peel_extended_info *min)
1244 vect_peel_info elem = *slot;
1245 int save_misalignment, dummy;
1246 unsigned int inside_cost = 0, outside_cost = 0, i;
1247 gimple *stmt = DR_STMT (elem->dr);
1248 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1249 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1250 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1251 struct data_reference *dr;
1252 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1254 prologue_cost_vec.create (2);
1255 body_cost_vec.create (2);
1256 epilogue_cost_vec.create (2);
1258 FOR_EACH_VEC_ELT (datarefs, i, dr)
1260 stmt = DR_STMT (dr);
1261 stmt_info = vinfo_for_stmt (stmt);
1262 /* For interleaving, only the alignment of the first access
1263 matters. */
1264 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1265 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1266 continue;
1268 /* Strided accesses perform only component accesses, alignment is
1269 irrelevant for them. */
1270 if (STMT_VINFO_STRIDED_P (stmt_info)
1271 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1272 continue;
1274 save_misalignment = DR_MISALIGNMENT (dr);
1275 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1276 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1277 &body_cost_vec);
1278 SET_DR_MISALIGNMENT (dr, save_misalignment);
1281 outside_cost += vect_get_known_peeling_cost
1282 (loop_vinfo, elem->npeel, &dummy,
1283 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1284 &prologue_cost_vec, &epilogue_cost_vec);
1286 /* Prologue and epilogue costs are added to the target model later.
1287 These costs depend only on the scalar iteration cost, the
1288 number of peeling iterations finally chosen, and the number of
1289 misaligned statements. So discard the information found here. */
1290 prologue_cost_vec.release ();
1291 epilogue_cost_vec.release ();
1293 if (inside_cost < min->inside_cost
1294 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1296 min->inside_cost = inside_cost;
1297 min->outside_cost = outside_cost;
1298 min->body_cost_vec.release ();
1299 min->body_cost_vec = body_cost_vec;
1300 min->peel_info.dr = elem->dr;
1301 min->peel_info.npeel = elem->npeel;
1303 else
1304 body_cost_vec.release ();
1306 return 1;
1310 /* Choose best peeling option by traversing peeling hash table and either
1311 choosing an option with the lowest cost (if cost model is enabled) or the
1312 option that aligns as many accesses as possible. */
1314 static struct data_reference *
1315 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1316 loop_vec_info loop_vinfo,
1317 unsigned int *npeel,
1318 stmt_vector_for_cost *body_cost_vec)
1320 struct _vect_peel_extended_info res;
1322 res.peel_info.dr = NULL;
1323 res.body_cost_vec = stmt_vector_for_cost ();
1325 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1327 res.inside_cost = INT_MAX;
1328 res.outside_cost = INT_MAX;
1329 peeling_htab->traverse <_vect_peel_extended_info *,
1330 vect_peeling_hash_get_lowest_cost> (&res);
1332 else
1334 res.peel_info.count = 0;
1335 peeling_htab->traverse <_vect_peel_extended_info *,
1336 vect_peeling_hash_get_most_frequent> (&res);
1339 *npeel = res.peel_info.npeel;
1340 *body_cost_vec = res.body_cost_vec;
1341 return res.peel_info.dr;
1345 /* Function vect_enhance_data_refs_alignment
1347 This pass will use loop versioning and loop peeling in order to enhance
1348 the alignment of data references in the loop.
1350 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1351 original loop is to be vectorized. Any other loops that are created by
1352 the transformations performed in this pass - are not supposed to be
1353 vectorized. This restriction will be relaxed.
1355 This pass will require a cost model to guide it whether to apply peeling
1356 or versioning or a combination of the two. For example, the scheme that
1357 Intel uses when given a loop with several memory accesses, is as follows:
1358 choose one memory access ('p') whose alignment you want to force by doing
1359 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1360 other accesses are not necessarily aligned, or (2) use loop versioning to
1361 generate one loop in which all accesses are aligned, and another loop in
1362 which only 'p' is necessarily aligned.
1364 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1365 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1366 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1368 Devising a cost model is the most critical aspect of this work. It will
1369 guide us on which access to peel for, whether to use loop versioning, how
1370 many versions to create, etc. The cost model will probably consist of
1371 generic considerations as well as target specific considerations (on
1372 powerpc for example, misaligned stores are more painful than misaligned
1373 loads).
1375 Here are the general steps involved in alignment enhancements:
1377 -- original loop, before alignment analysis:
1378 for (i=0; i<N; i++){
1379 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1380 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1383 -- After vect_compute_data_refs_alignment:
1384 for (i=0; i<N; i++){
1385 x = q[i]; # DR_MISALIGNMENT(q) = 3
1386 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1389 -- Possibility 1: we do loop versioning:
1390 if (p is aligned) {
1391 for (i=0; i<N; i++){ # loop 1A
1392 x = q[i]; # DR_MISALIGNMENT(q) = 3
1393 p[i] = y; # DR_MISALIGNMENT(p) = 0
1396 else {
1397 for (i=0; i<N; i++){ # loop 1B
1398 x = q[i]; # DR_MISALIGNMENT(q) = 3
1399 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1403 -- Possibility 2: we do loop peeling:
1404 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1405 x = q[i];
1406 p[i] = y;
1408 for (i = 3; i < N; i++){ # loop 2A
1409 x = q[i]; # DR_MISALIGNMENT(q) = 0
1410 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1413 -- Possibility 3: combination of loop peeling and versioning:
1414 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1415 x = q[i];
1416 p[i] = y;
1418 if (p is aligned) {
1419 for (i = 3; i<N; i++){ # loop 3A
1420 x = q[i]; # DR_MISALIGNMENT(q) = 0
1421 p[i] = y; # DR_MISALIGNMENT(p) = 0
1424 else {
1425 for (i = 3; i<N; i++){ # loop 3B
1426 x = q[i]; # DR_MISALIGNMENT(q) = 0
1427 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1431 These loops are later passed to loop_transform to be vectorized. The
1432 vectorizer will use the alignment information to guide the transformation
1433 (whether to generate regular loads/stores, or with special handling for
1434 misalignment). */
1436 bool
1437 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1439 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1440 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1441 enum dr_alignment_support supportable_dr_alignment;
1442 struct data_reference *dr0 = NULL, *first_store = NULL;
1443 struct data_reference *dr;
1444 unsigned int i, j;
1445 bool do_peeling = false;
1446 bool do_versioning = false;
1447 bool stat;
1448 gimple *stmt;
1449 stmt_vec_info stmt_info;
1450 unsigned int npeel = 0;
1451 bool all_misalignments_unknown = true;
1452 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1453 unsigned possible_npeel_number = 1;
1454 tree vectype;
1455 unsigned int nelements, mis, same_align_drs_max = 0;
1456 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1457 hash_table<peel_info_hasher> peeling_htab (1);
1459 if (dump_enabled_p ())
1460 dump_printf_loc (MSG_NOTE, vect_location,
1461 "=== vect_enhance_data_refs_alignment ===\n");
1463 /* Reset data so we can safely be called multiple times. */
1464 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1465 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1467 /* While cost model enhancements are expected in the future, the high level
1468 view of the code at this time is as follows:
1470 A) If there is a misaligned access then see if peeling to align
1471 this access can make all data references satisfy
1472 vect_supportable_dr_alignment. If so, update data structures
1473 as needed and return true.
1475 B) If peeling wasn't possible and there is a data reference with an
1476 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1477 then see if loop versioning checks can be used to make all data
1478 references satisfy vect_supportable_dr_alignment. If so, update
1479 data structures as needed and return true.
1481 C) If neither peeling nor versioning were successful then return false if
1482 any data reference does not satisfy vect_supportable_dr_alignment.
1484 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1486 Note, Possibility 3 above (which is peeling and versioning together) is not
1487 being done at this time. */
1489 /* (1) Peeling to force alignment. */
1491 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1492 Considerations:
1493 + How many accesses will become aligned due to the peeling
1494 - How many accesses will become unaligned due to the peeling,
1495 and the cost of misaligned accesses.
1496 - The cost of peeling (the extra runtime checks, the increase
1497 in code size). */
1499 FOR_EACH_VEC_ELT (datarefs, i, dr)
1501 stmt = DR_STMT (dr);
1502 stmt_info = vinfo_for_stmt (stmt);
1504 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1505 continue;
1507 /* For interleaving, only the alignment of the first access
1508 matters. */
1509 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1510 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1511 continue;
1513 /* For invariant accesses there is nothing to enhance. */
1514 if (integer_zerop (DR_STEP (dr)))
1515 continue;
1517 /* Strided accesses perform only component accesses, alignment is
1518 irrelevant for them. */
1519 if (STMT_VINFO_STRIDED_P (stmt_info)
1520 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1521 continue;
1523 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1524 do_peeling = vector_alignment_reachable_p (dr);
1525 if (do_peeling)
1527 if (known_alignment_for_access_p (dr))
1529 unsigned int npeel_tmp;
1530 bool negative = tree_int_cst_compare (DR_STEP (dr),
1531 size_zero_node) < 0;
1533 /* Save info about DR in the hash table. */
1534 vectype = STMT_VINFO_VECTYPE (stmt_info);
1535 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1536 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1537 TREE_TYPE (DR_REF (dr))));
1538 npeel_tmp = (negative
1539 ? (mis - nelements) : (nelements - mis))
1540 & (nelements - 1);
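/* For example (illustrative): a forward V4SI access misaligned by
   8 bytes has mis == 2 elements, so npeel_tmp == (4 - 2) & 3 == 2
   scalar iterations.  */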
1542 /* For multiple types, it is possible that the bigger type access
1543 will have more than one peeling option. E.g., a loop with two
1544 types: one of size (vector size / 4), and the other one of
1545 size (vector size / 8). Vectorization factor will be 8. If both
1546 accesses are misaligned by 3, the first one needs one scalar
1547 iteration to be aligned, and the second one needs 5. But the
1548 first one will be aligned also by peeling 5 scalar
1549 iterations, and in that case both accesses will be aligned.
1550 Hence, except for the immediate peeling amount, we also want
1551 to try to add full vector size, while we don't exceed
1552 vectorization factor.
1553 We do this automatically for cost model, since we calculate cost
1554 for every peeling option. */
1555 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1557 if (STMT_SLP_TYPE (stmt_info))
1558 possible_npeel_number
1559 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1560 else
1561 possible_npeel_number = vf / nelements;
1564 /* Handle the aligned case. We may decide to align some other
1565 access, making DR unaligned. */
1566 if (DR_MISALIGNMENT (dr) == 0)
1568 npeel_tmp = 0;
1569 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1570 possible_npeel_number++;
1573 for (j = 0; j < possible_npeel_number; j++)
1575 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1576 dr, npeel_tmp);
1577 npeel_tmp += nelements;
1580 all_misalignments_unknown = false;
1581 /* Data-ref that was chosen for the case that all the
1582 misalignments are unknown is not relevant anymore, since we
1583 have a data-ref with known alignment. */
1584 dr0 = NULL;
1586 else
1588 /* If we don't know any misalignment values, we prefer
1589 peeling for data-ref that has the maximum number of data-refs
1590 with the same alignment, unless the target prefers to align
1591 stores over loads. */
1592 if (all_misalignments_unknown)
1594 unsigned same_align_drs
1595 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1596 if (!dr0
1597 || same_align_drs_max < same_align_drs)
1599 same_align_drs_max = same_align_drs;
1600 dr0 = dr;
1602 /* For data-refs with the same number of related
1603 accesses prefer the one where the misalign
1604 computation will be invariant in the outermost loop. */
1605 else if (same_align_drs_max == same_align_drs)
1607 struct loop *ivloop0, *ivloop;
1608 ivloop0 = outermost_invariant_loop_for_expr
1609 (loop, DR_BASE_ADDRESS (dr0));
1610 ivloop = outermost_invariant_loop_for_expr
1611 (loop, DR_BASE_ADDRESS (dr));
1612 if ((ivloop && !ivloop0)
1613 || (ivloop && ivloop0
1614 && flow_loop_nested_p (ivloop, ivloop0)))
1615 dr0 = dr;
1618 if (!first_store && DR_IS_WRITE (dr))
1619 first_store = dr;
1622 /* If there are both known and unknown misaligned accesses in the
1623 loop, we choose peeling amount according to the known
1624 accesses. */
1625 if (!supportable_dr_alignment)
1627 dr0 = dr;
1628 if (!first_store && DR_IS_WRITE (dr))
1629 first_store = dr;
1633 else
1635 if (!aligned_access_p (dr))
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "vector alignment may not be reachable\n");
1640 break;
1645 /* Check if we can possibly peel the loop. */
1646 if (!vect_can_advance_ivs_p (loop_vinfo)
1647 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1648 || loop->inner)
1649 do_peeling = false;
1651 if (do_peeling
1652 && all_misalignments_unknown
1653 && vect_supportable_dr_alignment (dr0, false))
1655 /* Check if the target prefers to align stores over loads, i.e., if
1656 misaligned stores are more expensive than misaligned loads (taking
1657 drs with same alignment into account). */
1658 if (first_store && DR_IS_READ (dr0))
1660 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1661 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1662 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1663 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1664 stmt_vector_for_cost dummy;
1665 dummy.create (2);
1667 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1668 &dummy);
1669 vect_get_data_access_cost (first_store, &store_inside_cost,
1670 &store_outside_cost, &dummy);
1672 dummy.release ();
1674 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1675 aligning the load DR0). */
1676 load_inside_penalty = store_inside_cost;
1677 load_outside_penalty = store_outside_cost;
1678 for (i = 0;
1679 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1680 DR_STMT (first_store))).iterate (i, &dr);
1681 i++)
1682 if (DR_IS_READ (dr))
1684 load_inside_penalty += load_inside_cost;
1685 load_outside_penalty += load_outside_cost;
1687 else
1689 load_inside_penalty += store_inside_cost;
1690 load_outside_penalty += store_outside_cost;
1693 /* Calculate the penalty for leaving DR0 unaligned (by
1694 aligning the FIRST_STORE). */
1695 store_inside_penalty = load_inside_cost;
1696 store_outside_penalty = load_outside_cost;
1697 for (i = 0;
1698 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1699 DR_STMT (dr0))).iterate (i, &dr);
1700 i++)
1701 if (DR_IS_READ (dr))
1703 store_inside_penalty += load_inside_cost;
1704 store_outside_penalty += load_outside_cost;
1706 else
1708 store_inside_penalty += store_inside_cost;
1709 store_outside_penalty += store_outside_cost;
1712 if (load_inside_penalty > store_inside_penalty
1713 || (load_inside_penalty == store_inside_penalty
1714 && load_outside_penalty > store_outside_penalty))
1715 dr0 = first_store;
1718 /* Use peeling only if it may help to align other accesses in the loop or
1719 if it may help improve load bandwidth when we'd end up using
1720 unaligned loads. */
1721 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1722 if (STMT_VINFO_SAME_ALIGN_REFS
1723 (vinfo_for_stmt (DR_STMT (dr0))).length () == 0
1724 && (vect_supportable_dr_alignment (dr0, false)
1725 != dr_unaligned_supported
1726 || (DR_IS_READ (dr0)
1727 && (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1728 == builtin_vectorization_cost (unaligned_load,
1729 dr0_vt, -1)))))
1730 do_peeling = false;
1733 if (do_peeling && !dr0)
1735 /* Peeling is possible, but there is no data access that is not supported
1736 unless aligned. So we try to choose the best possible peeling. */
1738 /* We should get here only if there are drs with known misalignment. */
1739 gcc_assert (!all_misalignments_unknown);
1741 /* Choose the best peeling from the hash table. */
1742 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1743 loop_vinfo, &npeel,
1744 &body_cost_vec);
1745 if (!dr0 || !npeel)
1746 do_peeling = false;
1749 if (do_peeling)
1751 stmt = DR_STMT (dr0);
1752 stmt_info = vinfo_for_stmt (stmt);
1753 vectype = STMT_VINFO_VECTYPE (stmt_info);
1754 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1756 if (known_alignment_for_access_p (dr0))
1758 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1759 size_zero_node) < 0;
1760 if (!npeel)
1762 /* Since it's known at compile time, compute the number of
1763 iterations in the peeled loop (the peeling factor) for use in
1764 updating DR_MISALIGNMENT values. The peeling factor is the
1765 vectorization factor minus the misalignment as an element
1766 count. */
1767 mis = DR_MISALIGNMENT (dr0);
1768 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1769 npeel = ((negative ? mis - nelements : nelements - mis)
1770 & (nelements - 1));
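/* Worked example (illustrative values): with V4SI vectors
   (nelements = 4), 4-byte elements and DR_MISALIGNMENT (dr0) = 8
   bytes, mis = 8 / 4 = 2 elements, so for a positive step
   npeel = (4 - 2) & 3 = 2 peeled iterations make DR0 vector
   aligned.  */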
1773 /* For interleaved data access every iteration accesses all the
1774 members of the group, therefore we divide the number of iterations
1775 by the group size. */
1776 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1777 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1778 npeel /= GROUP_SIZE (stmt_info);
1780 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_NOTE, vect_location,
1782 "Try peeling by %d\n", npeel);
1785 /* Ensure that all data refs can be vectorized after the peel. */
1786 FOR_EACH_VEC_ELT (datarefs, i, dr)
1788 int save_misalignment;
1790 if (dr == dr0)
1791 continue;
1793 stmt = DR_STMT (dr);
1794 stmt_info = vinfo_for_stmt (stmt);
1795 /* For interleaving, only the alignment of the first access
1796 matters. */
1797 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1798 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1799 continue;
1801 /* Strided accesses perform only component accesses, alignment is
1802 irrelevant for them. */
1803 if (STMT_VINFO_STRIDED_P (stmt_info)
1804 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1805 continue;
1807 save_misalignment = DR_MISALIGNMENT (dr);
1808 vect_update_misalignment_for_peel (dr, dr0, npeel);
1809 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1810 SET_DR_MISALIGNMENT (dr, save_misalignment);
1812 if (!supportable_dr_alignment)
1814 do_peeling = false;
1815 break;
1819 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1821 stat = vect_verify_datarefs_alignment (loop_vinfo);
1822 if (!stat)
1823 do_peeling = false;
1824 else
1826 body_cost_vec.release ();
1827 return stat;
1831 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1832 if (do_peeling)
1834 unsigned max_allowed_peel
1835 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1836 if (max_allowed_peel != (unsigned)-1)
1838 unsigned max_peel = npeel;
1839 if (max_peel == 0)
1841 gimple *dr_stmt = DR_STMT (dr0);
1842 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1843 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1844 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1846 if (max_peel > max_allowed_peel)
1848 do_peeling = false;
1849 if (dump_enabled_p ())
1850 dump_printf_loc (MSG_NOTE, vect_location,
1851 "Disable peeling, max peels reached: %d\n", max_peel);
1856 /* Cost model #2 - if peeling may result in a remaining loop not
1857 iterating enough to be vectorized then do not peel. */
1858 if (do_peeling
1859 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1861 unsigned max_peel
1862 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1863 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1864 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1865 do_peeling = false;
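/* For example, with a vectorization factor of 4 and NPEEL unknown at
   compile time, max_peel is 3, so a loop whose known iteration count
   is below 4 + 3 = 7 is not peeled.  */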
1868 if (do_peeling)
1870 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1871 If the misalignment of DR_i is identical to that of dr0 then set
1872 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1873 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1874 by the peeling factor times the element size of DR_i (MOD the
1875 vectorization factor times the size). Otherwise, the
1876 misalignment of DR_i must be set to unknown. */
1877 FOR_EACH_VEC_ELT (datarefs, i, dr)
1878 if (dr != dr0)
1880 /* Strided accesses perform only component accesses, alignment
1881 is irrelevant for them. */
1882 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1883 if (STMT_VINFO_STRIDED_P (stmt_info)
1884 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1885 continue;
1887 vect_update_misalignment_for_peel (dr, dr0, npeel);
1890 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1891 if (npeel)
1892 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1893 else
1894 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1895 = DR_MISALIGNMENT (dr0);
1896 SET_DR_MISALIGNMENT (dr0, 0);
1897 if (dump_enabled_p ())
1899 dump_printf_loc (MSG_NOTE, vect_location,
1900 "Alignment of access forced using peeling.\n");
1901 dump_printf_loc (MSG_NOTE, vect_location,
1902 "Peeling for alignment will be applied.\n");
1904 /* The inside-loop cost will be accounted for in vectorizable_load
1905 and vectorizable_store correctly with adjusted alignments.
1906 Drop the body_cost_vec on the floor here. */
1907 body_cost_vec.release ();
1909 stat = vect_verify_datarefs_alignment (loop_vinfo);
1910 gcc_assert (stat);
1911 return stat;
1915 body_cost_vec.release ();
1917 /* (2) Versioning to force alignment. */
1919 /* Try versioning if:
1920 1) optimize loop for speed
1921 2) there is at least one unsupported misaligned data ref with an unknown
1922 misalignment, and
1923 3) all misaligned data refs with a known misalignment are supported, and
1924 4) the number of runtime alignment checks is within reason. */
1926 do_versioning =
1927 optimize_loop_nest_for_speed_p (loop)
1928 && (!loop->inner); /* FORNOW */
1930 if (do_versioning)
1932 FOR_EACH_VEC_ELT (datarefs, i, dr)
1934 stmt = DR_STMT (dr);
1935 stmt_info = vinfo_for_stmt (stmt);
1937 /* For interleaving, only the alignment of the first access
1938 matters. */
1939 if (aligned_access_p (dr)
1940 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1941 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1942 continue;
1944 if (STMT_VINFO_STRIDED_P (stmt_info))
1946 /* Strided loads perform only component accesses, alignment is
1947 irrelevant for them. */
1948 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1949 continue;
1950 do_versioning = false;
1951 break;
1954 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1956 if (!supportable_dr_alignment)
1958 gimple *stmt;
1959 int mask;
1960 tree vectype;
1962 if (known_alignment_for_access_p (dr)
1963 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1964 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1966 do_versioning = false;
1967 break;
1970 stmt = DR_STMT (dr);
1971 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1972 gcc_assert (vectype);
1974 /* The rightmost bits of an aligned address must be zeros.
1975 Construct the mask needed for this test. For example,
1976 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1977 mask must be 15 = 0xf. */
1978 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1980 /* FORNOW: use the same mask to test all potentially unaligned
1981 references in the loop. The vectorizer currently supports
1982 a single vector size, see the reference to
1983 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1984 vectorization factor is computed. */
1985 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1986 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1987 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1988 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1989 DR_STMT (dr));
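/* For example, with 16-byte V4SI vectors MASK is 0xf; the runtime
   versioning guard built later from LOOP_VINFO_PTR_MASK in effect
   checks that (address & 0xf) == 0 holds for every recorded statement
   before the vectorized loop version is entered.  */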
1993 /* Versioning requires at least one misaligned data reference. */
1994 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1995 do_versioning = false;
1996 else if (!do_versioning)
1997 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2000 if (do_versioning)
2002 vec<gimple *> may_misalign_stmts
2003 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2004 gimple *stmt;
2006 /* It can now be assumed that the data references in the statements
2007 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2008 of the loop being vectorized. */
2009 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2011 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2012 dr = STMT_VINFO_DATA_REF (stmt_info);
2013 SET_DR_MISALIGNMENT (dr, 0);
2014 if (dump_enabled_p ())
2015 dump_printf_loc (MSG_NOTE, vect_location,
2016 "Alignment of access forced using versioning.\n");
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_NOTE, vect_location,
2021 "Versioning for alignment will be applied.\n");
2023 /* Peeling and versioning can't be done together at this time. */
2024 gcc_assert (! (do_peeling && do_versioning));
2026 stat = vect_verify_datarefs_alignment (loop_vinfo);
2027 gcc_assert (stat);
2028 return stat;
2031 /* This point is reached if neither peeling nor versioning is being done. */
2032 gcc_assert (! (do_peeling || do_versioning));
2034 stat = vect_verify_datarefs_alignment (loop_vinfo);
2035 return stat;
2039 /* Function vect_find_same_alignment_drs.
2041 Update group and alignment relations according to the chosen
2042 vectorization factor. */
2044 static void
2045 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2046 loop_vec_info loop_vinfo)
2048 unsigned int i;
2049 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2050 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2051 struct data_reference *dra = DDR_A (ddr);
2052 struct data_reference *drb = DDR_B (ddr);
2053 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2054 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2055 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2056 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2057 lambda_vector dist_v;
2058 unsigned int loop_depth;
2060 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2061 return;
2063 if (dra == drb)
2064 return;
2066 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2067 return;
2069 /* Loop-based vectorization and known data dependence. */
2070 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2071 return;
2073 /* Data-dependence analysis reports a distance vector of zero
2074 for data-references that overlap only in the first iteration
2075 but whose steps have different signs (see PR45764).
2076 So as a sanity check require equal DR_STEP. */
2077 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2078 return;
2080 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2081 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2083 int dist = dist_v[loop_depth];
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_NOTE, vect_location,
2087 "dependence distance = %d.\n", dist);
2089 /* Same loop iteration. */
2090 if (dist == 0
2091 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2093 /* References whose distance is zero or a multiple of VF (with equal access size) have the same alignment. */
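/* For example (illustrative values), with a vectorization factor of 4
   and a common step of 4 bytes, a dependence distance of 4 iterations
   puts the two references 16 bytes apart, so their misalignments
   relative to a 16-byte vector are equal.  */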
2094 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2095 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2096 if (dump_enabled_p ())
2098 dump_printf_loc (MSG_NOTE, vect_location,
2099 "accesses have the same alignment.\n");
2100 dump_printf (MSG_NOTE,
2101 "dependence distance modulo vf == 0 between ");
2102 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2103 dump_printf (MSG_NOTE, " and ");
2104 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2105 dump_printf (MSG_NOTE, "\n");
2112 /* Function vect_analyze_data_refs_alignment
2114 Analyze the alignment of the data-references in the loop.
2115 Return FALSE if a data reference is found that cannot be vectorized. */
2117 bool
2118 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2120 if (dump_enabled_p ())
2121 dump_printf_loc (MSG_NOTE, vect_location,
2122 "=== vect_analyze_data_refs_alignment ===\n");
2124 /* Mark groups of data references with same alignment using
2125 data dependence information. */
2126 vec<ddr_p> ddrs = vinfo->ddrs;
2127 struct data_dependence_relation *ddr;
2128 unsigned int i;
2130 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2131 vect_find_same_alignment_drs (ddr, vinfo);
2133 vec<data_reference_p> datarefs = vinfo->datarefs;
2134 struct data_reference *dr;
2136 FOR_EACH_VEC_ELT (datarefs, i, dr)
2138 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2139 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2140 && !vect_compute_data_ref_alignment (dr))
2142 /* Strided accesses perform only component accesses, misalignment
2143 information is irrelevant for them. */
2144 if (STMT_VINFO_STRIDED_P (stmt_info)
2145 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2146 continue;
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150 "not vectorized: can't calculate alignment "
2151 "for data ref.\n");
2153 return false;
2157 return true;
2161 /* Analyze alignment of DRs of stmts in NODE. */
2163 static bool
2164 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2166 /* We vectorize from the first scalar stmt in the node unless
2167 the node is permuted in which case we start from the first
2168 element in the group. */
2169 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2170 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2171 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2172 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2174 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2175 if (! vect_compute_data_ref_alignment (dr)
2176 /* For creating the data-ref pointer we need alignment of the
2177 first element anyway. */
2178 || (dr != first_dr
2179 && ! vect_compute_data_ref_alignment (first_dr))
2180 || ! verify_data_ref_alignment (dr))
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184 "not vectorized: bad data alignment in basic "
2185 "block.\n");
2186 return false;
2189 return true;
2192 /* Function vect_slp_analyze_and_verify_instance_alignment
2194 Analyze the alignment of the data-references in the SLP instance.
2195 Return FALSE if a data reference is found that cannot be vectorized. */
2197 bool
2198 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2200 if (dump_enabled_p ())
2201 dump_printf_loc (MSG_NOTE, vect_location,
2202 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2204 slp_tree node;
2205 unsigned i;
2206 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2207 if (! vect_slp_analyze_and_verify_node_alignment (node))
2208 return false;
2210 node = SLP_INSTANCE_TREE (instance);
2211 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2212 && ! vect_slp_analyze_and_verify_node_alignment
2213 (SLP_INSTANCE_TREE (instance)))
2214 return false;
2216 return true;
2220 /* Analyze groups of accesses: check that DR belongs to a group of
2221 accesses of legal size, step, etc. Detect gaps, single element
2222 interleaving, and other special cases. Set grouped access info.
2223 Collect groups of strided stores for further use in SLP analysis.
2224 Worker for vect_analyze_group_access. */
2226 static bool
2227 vect_analyze_group_access_1 (struct data_reference *dr)
2229 tree step = DR_STEP (dr);
2230 tree scalar_type = TREE_TYPE (DR_REF (dr));
2231 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2232 gimple *stmt = DR_STMT (dr);
2233 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2234 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2235 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2236 HOST_WIDE_INT dr_step = -1;
2237 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2238 bool slp_impossible = false;
2240 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2241 size of the interleaving group (including gaps). */
2242 if (tree_fits_shwi_p (step))
2244 dr_step = tree_to_shwi (step);
2245 /* Check that STEP is a multiple of type size. Otherwise there is
2246 a non-element-sized gap at the end of the group which we
2247 cannot represent in GROUP_GAP or GROUP_SIZE.
2248 ??? As we can handle non-constant step fine here we should
2249 simply remove uses of GROUP_GAP between the last and first
2250 element and instead rely on DR_STEP. GROUP_SIZE then would
2251 simply not include that gap. */
2252 if ((dr_step % type_size) != 0)
2254 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_NOTE, vect_location,
2257 "Step ");
2258 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2259 dump_printf (MSG_NOTE,
2260 " is not a multiple of the element size for ");
2261 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2262 dump_printf (MSG_NOTE, "\n");
2264 return false;
2266 groupsize = absu_hwi (dr_step) / type_size;
2268 else
2269 groupsize = 0;
2271 /* A non-consecutive access is possible only as part of an interleaving group. */
2272 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2274 /* Check if this DR is part of an interleaving group, and is a single
2275 element of the group that is accessed in the loop. */
2277 /* Gaps are supported only for loads. STEP must be a multiple of the type
2278 size. The size of the group must be a power of 2. */
2279 if (DR_IS_READ (dr)
2280 && (dr_step % type_size) == 0
2281 && groupsize > 0
2282 && pow2p_hwi (groupsize))
2284 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2285 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2286 GROUP_GAP (stmt_info) = groupsize - 1;
2287 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_NOTE, vect_location,
2290 "Detected single element interleaving ");
2291 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2292 dump_printf (MSG_NOTE, " step ");
2293 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2294 dump_printf (MSG_NOTE, "\n");
2297 return true;
2300 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303 "not consecutive access ");
2304 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2307 if (bb_vinfo)
2309 /* Mark the statement as unvectorizable. */
2310 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2311 return true;
2314 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2315 STMT_VINFO_STRIDED_P (stmt_info) = true;
2316 return true;
2319 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2321 /* First stmt in the interleaving chain. Check the chain. */
2322 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2323 struct data_reference *data_ref = dr;
2324 unsigned int count = 1;
2325 tree prev_init = DR_INIT (data_ref);
2326 gimple *prev = stmt;
2327 HOST_WIDE_INT diff, gaps = 0;
2329 while (next)
2331 /* Skip same data-refs. If two or more stmts share a
2332 data-ref (supported only for loads), we vectorize only the first
2333 stmt, and the rest get their vectorized loads from the first
2334 one. */
2335 if (!tree_int_cst_compare (DR_INIT (data_ref),
2336 DR_INIT (STMT_VINFO_DATA_REF (
2337 vinfo_for_stmt (next)))))
2339 if (DR_IS_WRITE (data_ref))
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "Two store stmts share the same dr.\n");
2344 return false;
2347 if (dump_enabled_p ())
2348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2349 "Two or more load stmts share the same dr.\n");
2351 /* For loads, use the same data-ref load as for the first stmt. */
2352 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2354 prev = next;
2355 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2356 continue;
2359 prev = next;
2360 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2362 /* All group members have the same STEP by construction. */
2363 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2365 /* Check that the distance between two accesses is equal to the type
2366 size. Otherwise, we have gaps. */
2367 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2368 - TREE_INT_CST_LOW (prev_init)) / type_size;
2369 if (diff != 1)
2371 /* FORNOW: SLP of accesses with gaps is not supported. */
2372 slp_impossible = true;
2373 if (DR_IS_WRITE (data_ref))
2375 if (dump_enabled_p ())
2376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2377 "interleaved store with gaps\n");
2378 return false;
2381 gaps += diff - 1;
2384 last_accessed_element += diff;
2386 /* Store the gap from the previous member of the group. If there is no
2387 gap in the access, GROUP_GAP is always 1. */
2388 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2390 prev_init = DR_INIT (data_ref);
2391 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2392 /* Count the number of data-refs in the chain. */
2393 count++;
2396 if (groupsize == 0)
2397 groupsize = count + gaps;
2399 /* This could be UINT_MAX but as we are generating code in a very
2400 inefficient way we have to cap earlier. See PR78699 for example. */
2401 if (groupsize > 4096)
2403 if (dump_enabled_p ())
2404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2405 "group is too large\n");
2406 return false;
2409 /* Check that the size of the interleaving is equal to count for stores,
2410 i.e., that there are no gaps. */
2411 if (groupsize != count
2412 && !DR_IS_READ (dr))
2414 if (dump_enabled_p ())
2415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2416 "interleaved store with gaps\n");
2417 return false;
2420 /* If there is a gap after the last load in the group it is the
2421 difference between the groupsize and the last accessed
2422 element.
2423 When there is no gap, this difference should be 0. */
2424 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2426 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
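/* Illustrative example: a load group with DR_STEP equal to 4 times the
   element size and members at element offsets 0 and 2 gets
   groupsize = 4, GROUP_GAP of the second member = 2,
   last_accessed_element = 3 and hence a trailing GROUP_GAP of
   4 - 3 = 1 on the first statement.  */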
2427 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_NOTE, vect_location,
2430 "Detected interleaving ");
2431 if (DR_IS_READ (dr))
2432 dump_printf (MSG_NOTE, "load ");
2433 else
2434 dump_printf (MSG_NOTE, "store ");
2435 dump_printf (MSG_NOTE, "of size %u starting with ",
2436 (unsigned)groupsize);
2437 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2438 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2439 dump_printf_loc (MSG_NOTE, vect_location,
2440 "There is a gap of %u elements after the group\n",
2441 GROUP_GAP (vinfo_for_stmt (stmt)));
2444 /* SLP: create an SLP data structure for every interleaving group of
2445 stores for further analysis in vect_analyze_slp. */
2446 if (DR_IS_WRITE (dr) && !slp_impossible)
2448 if (loop_vinfo)
2449 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2450 if (bb_vinfo)
2451 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2455 return true;
2458 /* Analyze groups of accesses: check that DR belongs to a group of
2459 accesses of legal size, step, etc. Detect gaps, single element
2460 interleaving, and other special cases. Set grouped access info.
2461 Collect groups of strided stores for further use in SLP analysis. */
2463 static bool
2464 vect_analyze_group_access (struct data_reference *dr)
2466 if (!vect_analyze_group_access_1 (dr))
2468 /* Dissolve the group if present. */
2469 gimple *next;
2470 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2471 while (stmt)
2473 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2474 next = GROUP_NEXT_ELEMENT (vinfo);
2475 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2476 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2477 stmt = next;
2479 return false;
2481 return true;
2484 /* Analyze the access pattern of the data-reference DR.
2485 In case of non-consecutive accesses call vect_analyze_group_access() to
2486 analyze groups of accesses. */
2488 static bool
2489 vect_analyze_data_ref_access (struct data_reference *dr)
2491 tree step = DR_STEP (dr);
2492 tree scalar_type = TREE_TYPE (DR_REF (dr));
2493 gimple *stmt = DR_STMT (dr);
2494 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2495 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2496 struct loop *loop = NULL;
2498 if (loop_vinfo)
2499 loop = LOOP_VINFO_LOOP (loop_vinfo);
2501 if (loop_vinfo && !step)
2503 if (dump_enabled_p ())
2504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2505 "bad data-ref access in loop\n");
2506 return false;
2509 /* Allow loads with zero step in inner-loop vectorization. */
2510 if (loop_vinfo && integer_zerop (step))
2512 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2513 if (!nested_in_vect_loop_p (loop, stmt))
2514 return DR_IS_READ (dr);
2515 /* Allow references with zero step for outer loops marked
2516 with pragma omp simd only - it guarantees absence of
2517 loop-carried dependencies between inner loop iterations. */
2518 if (!loop->force_vectorize)
2520 if (dump_enabled_p ())
2521 dump_printf_loc (MSG_NOTE, vect_location,
2522 "zero step in inner loop of nest\n");
2523 return false;
2527 if (loop && nested_in_vect_loop_p (loop, stmt))
2529 /* Interleaved accesses are not yet supported within outer-loop
2530 vectorization for references in the inner-loop. */
2531 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2533 /* For the rest of the analysis we use the outer-loop step. */
2534 step = STMT_VINFO_DR_STEP (stmt_info);
2535 if (integer_zerop (step))
2537 if (dump_enabled_p ())
2538 dump_printf_loc (MSG_NOTE, vect_location,
2539 "zero step in outer loop.\n");
2540 return DR_IS_READ (dr);
2544 /* Consecutive? */
2545 if (TREE_CODE (step) == INTEGER_CST)
2547 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2548 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2549 || (dr_step < 0
2550 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2552 /* Mark that it is not interleaving. */
2553 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2554 return true;
2558 if (loop && nested_in_vect_loop_p (loop, stmt))
2560 if (dump_enabled_p ())
2561 dump_printf_loc (MSG_NOTE, vect_location,
2562 "grouped access in outer loop.\n");
2563 return false;
2567 /* Assume this is a DR handled by the non-constant strided access case. */
2568 if (TREE_CODE (step) != INTEGER_CST)
2569 return (STMT_VINFO_STRIDED_P (stmt_info)
2570 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2571 || vect_analyze_group_access (dr)));
2573 /* Not consecutive access - check if it's a part of interleaving group. */
2574 return vect_analyze_group_access (dr);
2577 /* Compare two data-references DRA and DRB to order them into chunks
2578 suitable for grouping. */
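/* The sort keys used below, in order, are: containing loop,
   DR_BASE_ADDRESS, DR_OFFSET, reads before writes, access size,
   DR_STEP, DR_INIT and, as a tie breaker, the statement UID.  */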
2580 static int
2581 dr_group_sort_cmp (const void *dra_, const void *drb_)
2583 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2584 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2585 int cmp;
2587 /* Stabilize sort. */
2588 if (dra == drb)
2589 return 0;
2591 /* DRs in different loops never belong to the same group. */
2592 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2593 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2594 if (loopa != loopb)
2595 return loopa->num < loopb->num ? -1 : 1;
2597 /* Ordering of DRs according to base. */
2598 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2600 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2601 DR_BASE_ADDRESS (drb));
2602 if (cmp != 0)
2603 return cmp;
2606 /* And according to DR_OFFSET. */
2607 if (!dr_equal_offsets_p (dra, drb))
2609 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2610 if (cmp != 0)
2611 return cmp;
2614 /* Put reads before writes. */
2615 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2616 return DR_IS_READ (dra) ? -1 : 1;
2618 /* Then sort by access size. */
2619 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2620 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2622 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2623 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2624 if (cmp != 0)
2625 return cmp;
2628 /* And by step. */
2629 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2631 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2632 if (cmp != 0)
2633 return cmp;
2636 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2637 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2638 if (cmp == 0)
2639 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2640 return cmp;
2643 /* Function vect_analyze_data_ref_accesses.
2645 Analyze the access pattern of all the data references in the loop.
2647 FORNOW: the only access pattern that is considered vectorizable is a
2648 simple step 1 (consecutive) access.
2650 FORNOW: handle only arrays and pointer accesses. */
2652 bool
2653 vect_analyze_data_ref_accesses (vec_info *vinfo)
2655 unsigned int i;
2656 vec<data_reference_p> datarefs = vinfo->datarefs;
2657 struct data_reference *dr;
2659 if (dump_enabled_p ())
2660 dump_printf_loc (MSG_NOTE, vect_location,
2661 "=== vect_analyze_data_ref_accesses ===\n");
2663 if (datarefs.is_empty ())
2664 return true;
2666 /* Sort the array of datarefs to make building the interleaving chains
2667 linear. Don't modify the original vector's order; it is needed for
2668 determining what dependencies are reversed. */
2669 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2670 datarefs_copy.qsort (dr_group_sort_cmp);
2672 /* Build the interleaving chains. */
2673 for (i = 0; i < datarefs_copy.length () - 1;)
2675 data_reference_p dra = datarefs_copy[i];
2676 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2677 stmt_vec_info lastinfo = NULL;
2678 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2680 ++i;
2681 continue;
2683 for (i = i + 1; i < datarefs_copy.length (); ++i)
2685 data_reference_p drb = datarefs_copy[i];
2686 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2687 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2688 break;
2690 /* ??? Imperfect sorting (non-compatible types, non-modulo
2691 accesses, same accesses) can lead to a group being artificially
2692 split here as we don't just skip over those. If it really
2693 matters we can push those to a worklist and re-iterate
2694 over them. Then we can just skip ahead to the next DR here. */
2696 /* DRs in a different loop should not be put into the same
2697 interleaving group. */
2698 if (gimple_bb (DR_STMT (dra))->loop_father
2699 != gimple_bb (DR_STMT (drb))->loop_father)
2700 break;
2702 /* Check that the data-refs have the same first location (except init)
2703 and that they are both either stores or loads (not a load and a store,
2704 and not masked loads or stores). */
2705 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2706 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2707 DR_BASE_ADDRESS (drb), 0)
2708 || !dr_equal_offsets_p (dra, drb)
2709 || !gimple_assign_single_p (DR_STMT (dra))
2710 || !gimple_assign_single_p (DR_STMT (drb)))
2711 break;
2713 /* Check that the data-refs have the same constant size. */
2714 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2715 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2716 if (!tree_fits_uhwi_p (sza)
2717 || !tree_fits_uhwi_p (szb)
2718 || !tree_int_cst_equal (sza, szb))
2719 break;
2721 /* Check that the data-refs have the same step. */
2722 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2723 break;
2725 /* Do not place the same access in the interleaving chain twice. */
2726 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2727 break;
2729 /* Check the types are compatible.
2730 ??? We don't distinguish this during sorting. */
2731 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2732 TREE_TYPE (DR_REF (drb))))
2733 break;
2735 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2736 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2737 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2738 gcc_assert (init_a <= init_b);
2740 /* If init_b == init_a + the size of the type * k, we have an
2741 interleaving, and DRA is accessed before DRB. */
2742 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2743 if (type_size_a == 0
2744 || (init_b - init_a) % type_size_a != 0)
2745 break;
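/* For example, with 4-byte elements and DR_INITs of 0 and 8,
   (8 - 0) % 4 == 0, so this particular check lets DRB stay in DRA's
   candidate group (leaving a gap at offset 4); the following checks
   may still split it.  */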
2747 /* If we have a store, the accesses are adjacent. This splits
2748 groups into chunks we support (we don't support vectorization
2749 of stores with gaps). */
2750 if (!DR_IS_READ (dra)
2751 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2752 (DR_INIT (datarefs_copy[i-1]))
2753 != type_size_a))
2754 break;
2756 /* If the step (when constant and non-zero) is not greater than the
2757 difference between the data-refs' inits, split the group here;
2758 this keeps groups to suitable sizes. */
2759 if (tree_fits_shwi_p (DR_STEP (dra)))
2761 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2762 if (step != 0 && step <= (init_b - init_a))
2763 break;
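/* For example, with a constant step of 16 bytes and 4-byte elements,
   refs with inits 0, 4, 8 and 12 can stay in one group, while a ref
   with init 16 (one full step away) starts a new chunk.  */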
2766 if (dump_enabled_p ())
2768 dump_printf_loc (MSG_NOTE, vect_location,
2769 "Detected interleaving ");
2770 if (DR_IS_READ (dra))
2771 dump_printf (MSG_NOTE, "load ");
2772 else
2773 dump_printf (MSG_NOTE, "store ");
2774 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2775 dump_printf (MSG_NOTE, " and ");
2776 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2777 dump_printf (MSG_NOTE, "\n");
2780 /* Link the found element into the group list. */
2781 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2783 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2784 lastinfo = stmtinfo_a;
2786 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2787 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2788 lastinfo = stmtinfo_b;
2792 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2793 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2794 && !vect_analyze_data_ref_access (dr))
2796 if (dump_enabled_p ())
2797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2798 "not vectorized: complicated access pattern.\n");
2800 if (is_a <bb_vec_info> (vinfo))
2802 /* Mark the statement as not vectorizable. */
2803 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2804 continue;
2806 else
2808 datarefs_copy.release ();
2809 return false;
2813 datarefs_copy.release ();
2814 return true;
2817 /* Function vect_vfa_segment_size.
2819 Create an expression that computes the size of the segment
2820 that will be accessed for a data reference. The function takes into
2821 account that realignment loads may access one more vector.
2823 Input:
2824 DR: The data reference.
2825 LENGTH_FACTOR: segment length to consider.
2827 Return an expression whose value is the size of segment which will be
2828 accessed by DR. */
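/* For example (illustrative values), with DR_STEP = 4 bytes and
   LENGTH_FACTOR = 4 the segment length is 16 bytes; for accesses using
   the dr_explicit_realign_optimized scheme one extra vector size is
   added, since that scheme may read one vector beyond the segment.  */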
2830 static tree
2831 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2833 tree segment_length;
2835 if (integer_zerop (DR_STEP (dr)))
2836 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2837 else
2838 segment_length = size_binop (MULT_EXPR,
2839 fold_convert (sizetype, DR_STEP (dr)),
2840 fold_convert (sizetype, length_factor));
2842 if (vect_supportable_dr_alignment (dr, false)
2843 == dr_explicit_realign_optimized)
2845 tree vector_size = TYPE_SIZE_UNIT
2846 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2848 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2850 return segment_length;
2853 /* Function vect_no_alias_p.
2855 Given data references A and B with equal base and offset, whose alias
2856 relation can be decided at compilation time, return TRUE if they do
2857 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
2858 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2859 respectively. */
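/* For example (illustrative values), with DR_INITs of 0 and 16 and
   segment lengths of 16 bytes each, segment A is [0, 16) and segment B
   is [16, 32); since seg_a_max <= seg_b_min the references cannot
   alias.  */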
2861 static bool
2862 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2863 tree segment_length_a, tree segment_length_b)
2865 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2866 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2867 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2868 return false;
2870 tree seg_a_min = DR_INIT (a);
2871 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2872 seg_a_min, segment_length_a);
2873 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2874 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2875 [a, a+12) */
2876 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2878 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2879 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2880 seg_a_max, unit_size);
2881 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2882 DR_INIT (a), unit_size);
2884 tree seg_b_min = DR_INIT (b);
2885 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2886 seg_b_min, segment_length_b);
2887 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
2889 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
2890 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
2891 seg_b_max, unit_size);
2892 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
2893 DR_INIT (b), unit_size);
2896 if (tree_int_cst_le (seg_a_max, seg_b_min)
2897 || tree_int_cst_le (seg_b_max, seg_a_min))
2898 return true;
2900 return false;
2903 /* Function vect_prune_runtime_alias_test_list.
2905 Prune a list of ddrs to be tested at run-time by versioning for alias.
2906 Merge several alias checks into one if possible.
2907 Return FALSE if the resulting list of ddrs is longer than allowed by
2908 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2910 bool
2911 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2913 vec<ddr_p> may_alias_ddrs =
2914 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2915 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2916 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2917 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2918 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2920 ddr_p ddr;
2921 unsigned int i;
2922 tree length_factor;
2924 if (dump_enabled_p ())
2925 dump_printf_loc (MSG_NOTE, vect_location,
2926 "=== vect_prune_runtime_alias_test_list ===\n");
2928 if (may_alias_ddrs.is_empty ())
2929 return true;
2931 comp_alias_ddrs.create (may_alias_ddrs.length ());
2933 /* First, we collect all data ref pairs for aliasing checks. */
2934 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2936 int comp_res;
2937 struct data_reference *dr_a, *dr_b;
2938 gimple *dr_group_first_a, *dr_group_first_b;
2939 tree segment_length_a, segment_length_b;
2940 gimple *stmt_a, *stmt_b;
2942 dr_a = DDR_A (ddr);
2943 stmt_a = DR_STMT (DDR_A (ddr));
2944 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2945 if (dr_group_first_a)
2947 stmt_a = dr_group_first_a;
2948 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2951 dr_b = DDR_B (ddr);
2952 stmt_b = DR_STMT (DDR_B (ddr));
2953 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2954 if (dr_group_first_b)
2956 stmt_b = dr_group_first_b;
2957 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2960 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2961 length_factor = scalar_loop_iters;
2962 else
2963 length_factor = size_int (vect_factor);
2964 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2965 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2967 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2968 DR_BASE_ADDRESS (dr_b));
2969 if (comp_res == 0)
2970 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
2971 DR_OFFSET (dr_b));
2973 /* Alias is known at compilation time. */
2974 if (comp_res == 0
2975 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
2976 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
2977 && TREE_CODE (segment_length_a) == INTEGER_CST
2978 && TREE_CODE (segment_length_b) == INTEGER_CST)
2980 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
2981 continue;
2983 if (dump_enabled_p ())
2984 dump_printf_loc (MSG_NOTE, vect_location,
2985 "not vectorized: compilation time alias.\n");
2987 return false;
2990 dr_with_seg_len_pair_t dr_with_seg_len_pair
2991 (dr_with_seg_len (dr_a, segment_length_a),
2992 dr_with_seg_len (dr_b, segment_length_b));
2994 /* Canonicalize pairs by sorting the two DR members. */
2995 if (comp_res > 0)
2996 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2998 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3001 prune_runtime_alias_test_list (&comp_alias_ddrs,
3002 (unsigned HOST_WIDE_INT) vect_factor);
3003 dump_printf_loc (MSG_NOTE, vect_location,
3004 "improved number of alias checks from %d to %d\n",
3005 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3006 if ((int) comp_alias_ddrs.length () >
3007 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3009 if (dump_enabled_p ())
3010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3011 "number of versioning for alias "
3012 "run-time tests exceeds %d "
3013 "(--param vect-max-version-for-alias-checks)\n",
3014 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3015 return false;
3018 /* All alias checks have been resolved at compilation time. */
3019 if (!comp_alias_ddrs.length ())
3020 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3022 return true;
3025 /* Return true if a non-affine read or write in STMT is suitable for a
3026 gather load or scatter store. Describe the operation in *INFO if so. */
3028 bool
3029 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3030 gather_scatter_info *info)
3032 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3033 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3034 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3035 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3036 tree offtype = NULL_TREE;
3037 tree decl, base, off;
3038 machine_mode pmode;
3039 int punsignedp, reversep, pvolatilep = 0;
3041 base = DR_REF (dr);
3042 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3043 see if we can use the def stmt of the address. */
3044 if (is_gimple_call (stmt)
3045 && gimple_call_internal_p (stmt)
3046 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3047 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3048 && TREE_CODE (base) == MEM_REF
3049 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3050 && integer_zerop (TREE_OPERAND (base, 1))
3051 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3053 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3054 if (is_gimple_assign (def_stmt)
3055 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3056 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3059 /* The gather and scatter builtins need an address of the form
3060 loop_invariant + vector * {1, 2, 4, 8}
3062 or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3063 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3064 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3065 multiplications and additions in it. To get a vector, we need
3066 a single SSA_NAME that will be defined in the loop and will
3067 contain everything that is not loop invariant and that can be
3068 vectorized. The following code attempts to find such a preexisting
3069 SSA_NAME OFF and put the loop invariants into a tree BASE
3070 that can be gimplified before the loop. */
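/* For example (hypothetical source), a gather read of a[b[i]] with
   4-byte elements typically ends up with BASE = &a, OFF = the SSA_NAME
   holding b[i] (possibly sign-extended) and SCALE = 4 after the
   decomposition below.  */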
3071 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3072 &punsignedp, &reversep, &pvolatilep);
3073 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3075 if (TREE_CODE (base) == MEM_REF)
3077 if (!integer_zerop (TREE_OPERAND (base, 1)))
3079 if (off == NULL_TREE)
3081 offset_int moff = mem_ref_offset (base);
3082 off = wide_int_to_tree (sizetype, moff);
3084 else
3085 off = size_binop (PLUS_EXPR, off,
3086 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3088 base = TREE_OPERAND (base, 0);
3090 else
3091 base = build_fold_addr_expr (base);
3093 if (off == NULL_TREE)
3094 off = size_zero_node;
3096 /* If base is not loop invariant, then either off is 0, in which case we
3097 start with just the constant offset in the loop invariant BASE and
3098 continue with base as OFF, or we give up.
3099 We could handle that case by gimplifying the addition of base + off
3100 into some SSA_NAME and using that as off, but for now punt. */
3101 if (!expr_invariant_in_loop_p (loop, base))
3103 if (!integer_zerop (off))
3104 return false;
3105 off = base;
3106 base = size_int (pbitpos / BITS_PER_UNIT);
3108 /* Otherwise put base + constant offset into the loop invariant BASE
3109 and continue with OFF. */
3110 else
3112 base = fold_convert (sizetype, base);
3113 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3116 /* OFF at this point may be either a SSA_NAME or some tree expression
3117 from get_inner_reference. Try to peel off loop invariants from it
3118 into BASE as long as possible. */
3119 STRIP_NOPS (off);
3120 while (offtype == NULL_TREE)
3122 enum tree_code code;
3123 tree op0, op1, add = NULL_TREE;
3125 if (TREE_CODE (off) == SSA_NAME)
3127 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3129 if (expr_invariant_in_loop_p (loop, off))
3130 return false;
3132 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3133 break;
3135 op0 = gimple_assign_rhs1 (def_stmt);
3136 code = gimple_assign_rhs_code (def_stmt);
3137 op1 = gimple_assign_rhs2 (def_stmt);
3139 else
3141 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3142 return false;
3143 code = TREE_CODE (off);
3144 extract_ops_from_tree (off, &code, &op0, &op1);
3146 switch (code)
3148 case POINTER_PLUS_EXPR:
3149 case PLUS_EXPR:
3150 if (expr_invariant_in_loop_p (loop, op0))
3152 add = op0;
3153 off = op1;
3154 do_add:
3155 add = fold_convert (sizetype, add);
3156 if (scale != 1)
3157 add = size_binop (MULT_EXPR, add, size_int (scale));
3158 base = size_binop (PLUS_EXPR, base, add);
3159 continue;
3161 if (expr_invariant_in_loop_p (loop, op1))
3163 add = op1;
3164 off = op0;
3165 goto do_add;
3167 break;
3168 case MINUS_EXPR:
3169 if (expr_invariant_in_loop_p (loop, op1))
3171 add = fold_convert (sizetype, op1);
3172 add = size_binop (MINUS_EXPR, size_zero_node, add);
3173 off = op0;
3174 goto do_add;
3176 break;
3177 case MULT_EXPR:
3178 if (scale == 1 && tree_fits_shwi_p (op1))
3180 scale = tree_to_shwi (op1);
3181 off = op0;
3182 continue;
3184 break;
3185 case SSA_NAME:
3186 off = op0;
3187 continue;
3188 CASE_CONVERT:
3189 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3190 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3191 break;
3192 if (TYPE_PRECISION (TREE_TYPE (op0))
3193 == TYPE_PRECISION (TREE_TYPE (off)))
3195 off = op0;
3196 continue;
3198 if (TYPE_PRECISION (TREE_TYPE (op0))
3199 < TYPE_PRECISION (TREE_TYPE (off)))
3201 off = op0;
3202 offtype = TREE_TYPE (off);
3203 STRIP_NOPS (off);
3204 continue;
3206 break;
3207 default:
3208 break;
3210 break;
3213 /* If at the end OFF still isn't an SSA_NAME or isn't
3214 defined in the loop, punt. */
3215 if (TREE_CODE (off) != SSA_NAME
3216 || expr_invariant_in_loop_p (loop, off))
3217 return false;
3219 if (offtype == NULL_TREE)
3220 offtype = TREE_TYPE (off);
3222 if (DR_IS_READ (dr))
3223 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3224 offtype, scale);
3225 else
3226 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3227 offtype, scale);
3229 if (decl == NULL_TREE)
3230 return false;
3232 info->decl = decl;
3233 info->base = base;
3234 info->offset = off;
3235 info->offset_dt = vect_unknown_def_type;
3236 info->offset_vectype = NULL_TREE;
3237 info->scale = scale;
3238 return true;
3241 /* Function vect_analyze_data_refs.
3243 Find all the data references in the loop or basic block.
3245 The general structure of the analysis of data refs in the vectorizer is as
3246 follows:
3247 1- vect_analyze_data_refs(loop/bb): call
3248 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3249 in the loop/bb and their dependences.
3250 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3251 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3252 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3256 bool
3257 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3259 struct loop *loop = NULL;
3260 unsigned int i;
3261 struct data_reference *dr;
3262 tree scalar_type;
3264 if (dump_enabled_p ())
3265 dump_printf_loc (MSG_NOTE, vect_location,
3266 "=== vect_analyze_data_refs ===\n");
3268 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3269 loop = LOOP_VINFO_LOOP (loop_vinfo);
3271 /* Go through the data-refs, check that the analysis succeeded. Update
3272 pointer from stmt_vec_info struct to DR and vectype. */
3274 vec<data_reference_p> datarefs = vinfo->datarefs;
3275 FOR_EACH_VEC_ELT (datarefs, i, dr)
3277 gimple *stmt;
3278 stmt_vec_info stmt_info;
3279 tree base, offset, init;
3280 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3281 bool simd_lane_access = false;
3282 int vf;
3284 again:
3285 if (!dr || !DR_REF (dr))
3287 if (dump_enabled_p ())
3288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3289 "not vectorized: unhandled data-ref\n");
3290 return false;
3293 stmt = DR_STMT (dr);
3294 stmt_info = vinfo_for_stmt (stmt);
3296 /* Discard clobbers from the dataref vector. We will remove
3297 clobber stmts during vectorization. */
3298 if (gimple_clobber_p (stmt))
3300 free_data_ref (dr);
3301 if (i == datarefs.length () - 1)
3303 datarefs.pop ();
3304 break;
3306 datarefs.ordered_remove (i);
3307 dr = datarefs[i];
3308 goto again;
3311 /* Check that analysis of the data-ref succeeded. */
3312 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3313 || !DR_STEP (dr))
3315 bool maybe_gather
3316 = DR_IS_READ (dr)
3317 && !TREE_THIS_VOLATILE (DR_REF (dr))
3318 && targetm.vectorize.builtin_gather != NULL;
3319 bool maybe_scatter
3320 = DR_IS_WRITE (dr)
3321 && !TREE_THIS_VOLATILE (DR_REF (dr))
3322 && targetm.vectorize.builtin_scatter != NULL;
3323 bool maybe_simd_lane_access
3324 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3326 /* If the target supports vector gather loads or scatter stores, or if
3327 this might be a SIMD lane access, see whether they can be used. */
3328 if (is_a <loop_vec_info> (vinfo)
3329 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3330 && !nested_in_vect_loop_p (loop, stmt))
3332 struct data_reference *newdr
3333 = create_data_ref (NULL, loop_containing_stmt (stmt),
3334 DR_REF (dr), stmt, maybe_scatter ? false : true);
3335 gcc_assert (newdr != NULL && DR_REF (newdr));
3336 if (DR_BASE_ADDRESS (newdr)
3337 && DR_OFFSET (newdr)
3338 && DR_INIT (newdr)
3339 && DR_STEP (newdr)
3340 && integer_zerop (DR_STEP (newdr)))
3342 if (maybe_simd_lane_access)
3344 tree off = DR_OFFSET (newdr);
3345 STRIP_NOPS (off);
3346 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3347 && TREE_CODE (off) == MULT_EXPR
3348 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3350 tree step = TREE_OPERAND (off, 1);
3351 off = TREE_OPERAND (off, 0);
3352 STRIP_NOPS (off);
3353 if (CONVERT_EXPR_P (off)
3354 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3355 0)))
3356 < TYPE_PRECISION (TREE_TYPE (off)))
3357 off = TREE_OPERAND (off, 0);
3358 if (TREE_CODE (off) == SSA_NAME)
3360 gimple *def = SSA_NAME_DEF_STMT (off);
3361 tree reft = TREE_TYPE (DR_REF (newdr));
3362 if (is_gimple_call (def)
3363 && gimple_call_internal_p (def)
3364 && (gimple_call_internal_fn (def)
3365 == IFN_GOMP_SIMD_LANE))
3367 tree arg = gimple_call_arg (def, 0);
3368 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3369 arg = SSA_NAME_VAR (arg);
3370 if (arg == loop->simduid
3371 /* For now. */
3372 && tree_int_cst_equal
3373 (TYPE_SIZE_UNIT (reft),
3374 step))
3376 DR_OFFSET (newdr) = ssize_int (0);
3377 DR_STEP (newdr) = step;
3378 DR_ALIGNED_TO (newdr)
3379 = size_int (BIGGEST_ALIGNMENT);
3380 dr = newdr;
3381 simd_lane_access = true;
3387 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3389 dr = newdr;
3390 if (maybe_gather)
3391 gatherscatter = GATHER;
3392 else
3393 gatherscatter = SCATTER;
3396 if (gatherscatter == SG_NONE && !simd_lane_access)
3397 free_data_ref (newdr);
3400 if (gatherscatter == SG_NONE && !simd_lane_access)
3402 if (dump_enabled_p ())
3404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3405 "not vectorized: data ref analysis "
3406 "failed ");
3407 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3410 if (is_a <bb_vec_info> (vinfo))
3411 break;
3413 return false;
3417 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3419 if (dump_enabled_p ())
3420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3421 "not vectorized: base addr of dr is a "
3422 "constant\n");
3424 if (is_a <bb_vec_info> (vinfo))
3425 break;
3427 if (gatherscatter != SG_NONE || simd_lane_access)
3428 free_data_ref (dr);
3429 return false;
3432 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3434 if (dump_enabled_p ())
3436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3437 "not vectorized: volatile type ");
3438 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3441 if (is_a <bb_vec_info> (vinfo))
3442 break;
3444 return false;
3447 if (stmt_can_throw_internal (stmt))
3449 if (dump_enabled_p ())
3451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3452 "not vectorized: statement can throw an "
3453 "exception ");
3454 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3457 if (is_a <bb_vec_info> (vinfo))
3458 break;
3460 if (gatherscatter != SG_NONE || simd_lane_access)
3461 free_data_ref (dr);
3462 return false;
3465 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3466 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3468 if (dump_enabled_p ())
3470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3471 "not vectorized: statement is bitfield "
3472 "access ");
3473 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3476 if (is_a <bb_vec_info> (vinfo))
3477 break;
3479 if (gatherscatter != SG_NONE || simd_lane_access)
3480 free_data_ref (dr);
3481 return false;
3484 base = unshare_expr (DR_BASE_ADDRESS (dr));
3485 offset = unshare_expr (DR_OFFSET (dr));
3486 init = unshare_expr (DR_INIT (dr));
3488 if (is_gimple_call (stmt)
3489 && (!gimple_call_internal_p (stmt)
3490 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3491 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3493 if (dump_enabled_p ())
3495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3496 "not vectorized: dr in a call ");
3497 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3500 if (is_a <bb_vec_info> (vinfo))
3501 break;
3503 if (gatherscatter != SG_NONE || simd_lane_access)
3504 free_data_ref (dr);
3505 return false;
3508 /* Update DR field in stmt_vec_info struct. */
3510 /* If the dataref is in an inner-loop of the loop that is considered
3511 for vectorization, we also want to analyze the access relative to
3512 the outer-loop (DR contains information only relative to the
3513 inner-most enclosing loop). We do that by building a reference to the
3514 first location accessed by the inner-loop, and analyze it relative to
3515 the outer-loop. */
3516 if (loop && nested_in_vect_loop_p (loop, stmt))
3518 tree outer_step, outer_base, outer_init;
3519 HOST_WIDE_INT pbitsize, pbitpos;
3520 tree poffset;
3521 machine_mode pmode;
3522 int punsignedp, preversep, pvolatilep;
3523 affine_iv base_iv, offset_iv;
3524 tree dinit;
3526 /* Build a reference to the first location accessed by the
3527 inner-loop: *(BASE+INIT). (The first location is actually
3528 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3529 tree inner_base = build_fold_indirect_ref
3530 (fold_build_pointer_plus (base, init));
3532 if (dump_enabled_p ())
3534 dump_printf_loc (MSG_NOTE, vect_location,
3535 "analyze in outer-loop: ");
3536 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3537 dump_printf (MSG_NOTE, "\n");
3540 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3541 &poffset, &pmode, &punsignedp,
3542 &preversep, &pvolatilep);
3543 gcc_assert (outer_base != NULL_TREE);
3545 if (pbitpos % BITS_PER_UNIT != 0)
3547 if (dump_enabled_p ())
3548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3549 "failed: bit offset alignment.\n");
3550 return false;
3553 if (preversep)
3555 if (dump_enabled_p ())
3556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3557 "failed: reverse storage order.\n");
3558 return false;
3561 outer_base = build_fold_addr_expr (outer_base);
3562 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3563 &base_iv, false))
3565 if (dump_enabled_p ())
3566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3567 "failed: evolution of base is not affine.\n");
3568 return false;
3571 if (offset)
3573 if (poffset)
3574 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3575 poffset);
3576 else
3577 poffset = offset;
3580 if (!poffset)
3582 offset_iv.base = ssize_int (0);
3583 offset_iv.step = ssize_int (0);
3585 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3586 &offset_iv, false))
3588 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3590 "evolution of offset is not affine.\n");
3591 return false;
3594 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3595 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3596 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3597 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3598 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3600 outer_step = size_binop (PLUS_EXPR,
3601 fold_convert (ssizetype, base_iv.step),
3602 fold_convert (ssizetype, offset_iv.step));
3604 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3605 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3606 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3607 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3608 STMT_VINFO_DR_OFFSET (stmt_info) =
3609 fold_convert (ssizetype, offset_iv.base);
3610 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3611 size_int (highest_pow2_factor (offset_iv.base));
3613 if (dump_enabled_p ())
3615 dump_printf_loc (MSG_NOTE, vect_location,
3616 "\touter base_address: ");
3617 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3618 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3619 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3620 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3621 STMT_VINFO_DR_OFFSET (stmt_info));
3622 dump_printf (MSG_NOTE,
3623 "\n\touter constant offset from base address: ");
3624 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3625 STMT_VINFO_DR_INIT (stmt_info));
3626 dump_printf (MSG_NOTE, "\n\touter step: ");
3627 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3628 STMT_VINFO_DR_STEP (stmt_info));
3629 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3630 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3631 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3632 dump_printf (MSG_NOTE, "\n");
3636 if (STMT_VINFO_DATA_REF (stmt_info))
3638 if (dump_enabled_p ())
3640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3641 "not vectorized: more than one data ref "
3642 "in stmt: ");
3643 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3646 if (is_a <bb_vec_info> (vinfo))
3647 break;
3649 if (gatherscatter != SG_NONE || simd_lane_access)
3650 free_data_ref (dr);
3651 return false;
3654 STMT_VINFO_DATA_REF (stmt_info) = dr;
3655 if (simd_lane_access)
3657 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3658 free_data_ref (datarefs[i]);
3659 datarefs[i] = dr;
3662 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3663 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3664 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3666 if (dump_enabled_p ())
3668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3669 "not vectorized: base object not addressable "
3670 "for stmt: ");
3671 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3673 if (is_a <bb_vec_info> (vinfo))
3675 /* In BB vectorization the ref can still participate
3676 in dependence analysis, we just can't vectorize it. */
3677 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3678 continue;
3680 return false;
3683 /* Set vectype for STMT. */
3684 scalar_type = TREE_TYPE (DR_REF (dr));
3685 STMT_VINFO_VECTYPE (stmt_info)
3686 = get_vectype_for_scalar_type (scalar_type);
3687 if (!STMT_VINFO_VECTYPE (stmt_info))
3689 if (dump_enabled_p ())
3691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3692 "not vectorized: no vectype for stmt: ");
3693 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3694 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3695 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3696 scalar_type);
3697 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3700 if (is_a <bb_vec_info> (vinfo))
3702 /* No vector type is fine, the ref can still participate
3703 in dependence analysis, we just can't vectorize it. */
3704 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3705 continue;
3708 if (gatherscatter != SG_NONE || simd_lane_access)
3710 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3711 if (gatherscatter != SG_NONE)
3712 free_data_ref (dr);
3714 return false;
3716 else
3718 if (dump_enabled_p ())
3720 dump_printf_loc (MSG_NOTE, vect_location,
3721 "got vectype for stmt: ");
3722 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3723 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3724 STMT_VINFO_VECTYPE (stmt_info));
3725 dump_printf (MSG_NOTE, "\n");
3729 /* Adjust the minimal vectorization factor according to the
3730 vector type. */
3731 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3732 if (vf > *min_vf)
3733 *min_vf = vf;
3735 if (gatherscatter != SG_NONE)
3737 gather_scatter_info gs_info;
3738 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3739 &gs_info)
3740 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3742 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3743 free_data_ref (dr);
3744 if (dump_enabled_p ())
3746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3747 (gatherscatter == GATHER) ?
3748 "not vectorized: not suitable for gather "
3749 "load " :
3750 "not vectorized: not suitable for scatter "
3751 "store ");
3752 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3754 return false;
3757 free_data_ref (datarefs[i]);
3758 datarefs[i] = dr;
3759 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3762 else if (is_a <loop_vec_info> (vinfo)
3763 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3765 if (nested_in_vect_loop_p (loop, stmt))
3767 if (dump_enabled_p ())
3769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3770 "not vectorized: not suitable for strided "
3771 "load ");
3772 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3774 return false;
3776 STMT_VINFO_STRIDED_P (stmt_info) = true;
3780 /* If we stopped analysis at the first dataref we could not analyze
3781 when trying to vectorize a basic-block mark the rest of the datarefs
3782 as not vectorizable and truncate the vector of datarefs. That
3783 avoids spending useless time in analyzing their dependence. */
3784 if (i != datarefs.length ())
3786 gcc_assert (is_a <bb_vec_info> (vinfo));
3787 for (unsigned j = i; j < datarefs.length (); ++j)
3789 data_reference_p dr = datarefs[j];
3790 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3791 free_data_ref (dr);
3793 datarefs.truncate (i);
3796 return true;
3800 /* Function vect_get_new_vect_var.
3802 Returns a name for a new variable. The current naming scheme prepends
3803 a prefix such as "vect" or "vectp" (depending on the value of VAR_KIND)
3804 to the names of vectorizer generated variables, followed by NAME if
3805 provided. */
3807 tree
3808 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3810 const char *prefix;
3811 tree new_vect_var;
3813 switch (var_kind)
3815 case vect_simple_var:
3816 prefix = "vect";
3817 break;
3818 case vect_scalar_var:
3819 prefix = "stmp";
3820 break;
3821 case vect_mask_var:
3822 prefix = "mask";
3823 break;
3824 case vect_pointer_var:
3825 prefix = "vectp";
3826 break;
3827 default:
3828 gcc_unreachable ();
3831 if (name)
3833 char* tmp = concat (prefix, "_", name, NULL);
3834 new_vect_var = create_tmp_reg (type, tmp);
3835 free (tmp);
3837 else
3838 new_vect_var = create_tmp_reg (type, prefix);
3840 return new_vect_var;
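/* For example (hypothetical arguments), vect_get_new_vect_var (ptr_type,
vect_pointer_var, "in") creates a temporary register whose name is based
on "vectp_in", while a NULL NAME yields just the "vectp" prefix. */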
3843 /* Like vect_get_new_vect_var but return an SSA name. */
3845 tree
3846 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3848 const char *prefix;
3849 tree new_vect_var;
3851 switch (var_kind)
3853 case vect_simple_var:
3854 prefix = "vect";
3855 break;
3856 case vect_scalar_var:
3857 prefix = "stmp";
3858 break;
3859 case vect_pointer_var:
3860 prefix = "vectp";
3861 break;
3862 default:
3863 gcc_unreachable ();
3866 if (name)
3868 char* tmp = concat (prefix, "_", name, NULL);
3869 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3870 free (tmp);
3872 else
3873 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3875 return new_vect_var;
3878 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3880 static void
3881 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3882 stmt_vec_info stmt_info)
3884 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3885 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3886 int misalign = DR_MISALIGNMENT (dr);
3887 if (misalign == -1)
3888 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3889 else
3890 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3893 /* Function vect_create_addr_base_for_vector_ref.
3895 Create an expression that computes the address of the first memory location
3896 that will be accessed for a data reference.
3898 Input:
3899 STMT: The statement containing the data reference.
3900 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3901 OFFSET: Optional. If supplied, it is added to the initial address.
3902 LOOP: Specify relative to which loop-nest should the address be computed.
3903 For example, when the dataref is in an inner-loop nested in an
3904 outer-loop that is now being vectorized, LOOP can be either the
3905 outer-loop, or the inner-loop. The first memory location accessed
3906 by the following dataref ('in' points to short):
3908 for (i=0; i<N; i++)
3909 for (j=0; j<M; j++)
3910 s += in[i+j]
3912 is as follows:
3913 if LOOP=i_loop: &in (relative to i_loop)
3914 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3915 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3916 initial address. Unlike OFFSET, which is the number of elements to
3917 be added, BYTE_OFFSET is measured in bytes.
3919 Output:
3920 1. Return an SSA_NAME whose value is the address of the memory location of
3921 the first vector of the data reference.
3922 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3923 these statement(s) which define the returned SSA_NAME.
3925 FORNOW: We are only handling array accesses with step 1. */
3927 tree
3928 vect_create_addr_base_for_vector_ref (gimple *stmt,
3929 gimple_seq *new_stmt_list,
3930 tree offset,
3931 struct loop *loop,
3932 tree byte_offset)
3934 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3935 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3936 tree data_ref_base;
3937 const char *base_name;
3938 tree addr_base;
3939 tree dest;
3940 gimple_seq seq = NULL;
3941 tree base_offset;
3942 tree init;
3943 tree vect_ptr_type;
3944 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3945 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3947 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3949 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3951 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3953 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3954 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3955 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3957 else
3959 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3960 base_offset = unshare_expr (DR_OFFSET (dr));
3961 init = unshare_expr (DR_INIT (dr));
3964 if (loop_vinfo)
3965 base_name = get_name (data_ref_base);
3966 else
3968 base_offset = ssize_int (0);
3969 init = ssize_int (0);
3970 base_name = get_name (DR_REF (dr));
3973 /* Create base_offset */
3974 base_offset = size_binop (PLUS_EXPR,
3975 fold_convert (sizetype, base_offset),
3976 fold_convert (sizetype, init));
3978 if (offset)
3980 offset = fold_build2 (MULT_EXPR, sizetype,
3981 fold_convert (sizetype, offset), step);
3982 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3983 base_offset, offset);
3985 if (byte_offset)
3987 byte_offset = fold_convert (sizetype, byte_offset);
3988 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3989 base_offset, byte_offset);
3992 /* base + base_offset */
3993 if (loop_vinfo)
3994 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3995 else
3997 addr_base = build1 (ADDR_EXPR,
3998 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3999 unshare_expr (DR_REF (dr)));
4002 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4003 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4004 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4005 gimple_seq_add_seq (new_stmt_list, seq);
4007 if (DR_PTR_INFO (dr)
4008 && TREE_CODE (addr_base) == SSA_NAME
4009 && !SSA_NAME_PTR_INFO (addr_base))
4011 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4012 if (offset || byte_offset)
4013 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4016 if (dump_enabled_p ())
4018 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4019 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4020 dump_printf (MSG_NOTE, "\n");
4023 return addr_base;
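/* As a rough sketch (hypothetical names and constants), for a data-ref
in[i] with 'in' pointing to short and OFFSET equal to 4 elements, the
address built above is

vectp_in = &in + (DR_OFFSET + DR_INIT + 4 * 2);

i.e. OFFSET is scaled by the element size (STEP == 2 bytes) before being
folded into BASE_OFFSET. */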
4027 /* Function vect_create_data_ref_ptr.
4029 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4030 location accessed in the loop by STMT, along with the def-use update
4031 chain to appropriately advance the pointer through the loop iterations.
4032 Also set aliasing information for the pointer. This pointer is used by
4033 the callers to this function to create a memory reference expression for
4034 vector load/store access.
4036 Input:
4037 1. STMT: a stmt that references memory. Expected to be of the form
4038 GIMPLE_ASSIGN <name, data-ref> or
4039 GIMPLE_ASSIGN <data-ref, name>.
4040 2. AGGR_TYPE: the type of the reference, which should be either a vector
4041 or an array.
4042 3. AT_LOOP: the loop where the vector memref is to be created.
4043 4. OFFSET (optional): an offset to be added to the initial address accessed
4044 by the data-ref in STMT.
4045 5. BSI: location where the new stmts are to be placed if there is no loop
4046 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4047 pointing to the initial address.
4048 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4049 to the initial address accessed by the data-ref in STMT. This is
4050 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4051 in bytes.
4053 Output:
4054 1. Declare a new ptr to vector_type, and have it point to the base of the
4055 data reference (initial address accessed by the data reference).
4056 For example, for vector of type V8HI, the following code is generated:
4058 v8hi *ap;
4059 ap = (v8hi *)initial_address;
4061 if OFFSET is not supplied:
4062 initial_address = &a[init];
4063 if OFFSET is supplied:
4064 initial_address = &a[init + OFFSET];
4065 if BYTE_OFFSET is supplied:
4066 initial_address = &a[init] + BYTE_OFFSET;
4068 Return the initial_address in INITIAL_ADDRESS.
4070 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4071 update the pointer in each iteration of the loop.
4073 Return the increment stmt that updates the pointer in PTR_INCR.
4075 3. Set INV_P to true if the access pattern of the data reference in the
4076 vectorized loop is invariant. Set it to false otherwise.
4078 4. Return the pointer. */
4080 tree
4081 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4082 tree offset, tree *initial_address,
4083 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4084 bool only_init, bool *inv_p, tree byte_offset)
4086 const char *base_name;
4087 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4088 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4089 struct loop *loop = NULL;
4090 bool nested_in_vect_loop = false;
4091 struct loop *containing_loop = NULL;
4092 tree aggr_ptr_type;
4093 tree aggr_ptr;
4094 tree new_temp;
4095 gimple_seq new_stmt_list = NULL;
4096 edge pe = NULL;
4097 basic_block new_bb;
4098 tree aggr_ptr_init;
4099 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4100 tree aptr;
4101 gimple_stmt_iterator incr_gsi;
4102 bool insert_after;
4103 tree indx_before_incr, indx_after_incr;
4104 gimple *incr;
4105 tree step;
4106 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4108 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4109 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4111 if (loop_vinfo)
4113 loop = LOOP_VINFO_LOOP (loop_vinfo);
4114 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4115 containing_loop = (gimple_bb (stmt))->loop_father;
4116 pe = loop_preheader_edge (loop);
4118 else
4120 gcc_assert (bb_vinfo);
4121 only_init = true;
4122 *ptr_incr = NULL;
4125 /* Check the step (evolution) of the load in LOOP, and record
4126 whether it's invariant. */
4127 if (nested_in_vect_loop)
4128 step = STMT_VINFO_DR_STEP (stmt_info);
4129 else
4130 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4132 if (integer_zerop (step))
4133 *inv_p = true;
4134 else
4135 *inv_p = false;
4137 /* Create an expression for the first address accessed by this load
4138 in LOOP. */
4139 base_name = get_name (DR_BASE_ADDRESS (dr));
4141 if (dump_enabled_p ())
4143 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4144 dump_printf_loc (MSG_NOTE, vect_location,
4145 "create %s-pointer variable to type: ",
4146 get_tree_code_name (TREE_CODE (aggr_type)));
4147 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4148 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4149 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4150 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4151 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4152 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4153 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4154 else
4155 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4157 dump_printf (MSG_NOTE, "\n");
4160 /* (1) Create the new aggregate-pointer variable.
4161 Vector and array types inherit the alias set of their component
4162 type by default so we need to use a ref-all pointer if the data
4163 reference does not conflict with the created aggregated data
4164 reference because it is not addressable. */
4165 bool need_ref_all = false;
4166 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4167 get_alias_set (DR_REF (dr))))
4168 need_ref_all = true;
4169 /* Likewise for any of the data references in the stmt group. */
4170 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4172 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4175 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4176 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4177 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4178 get_alias_set (DR_REF (sdr))))
4180 need_ref_all = true;
4181 break;
4183 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4185 while (orig_stmt);
4187 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4188 need_ref_all);
4189 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4192 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4193 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4194 def-use update cycles for the pointer: one relative to the outer-loop
4195 (LOOP), which is what steps (3) and (4) below do. The other is relative
4196 to the inner-loop (which is the inner-most loop containing the dataref),
4197 and this is done by step (5) below.
4199 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4200 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4201 redundant. Steps (3),(4) create the following:
4203 vp0 = &base_addr;
4204 LOOP: vp1 = phi(vp0,vp2)
4207 vp2 = vp1 + step
4208 goto LOOP
4210 If there is an inner-loop nested in loop, then step (5) will also be
4211 applied, and an additional update in the inner-loop will be created:
4213 vp0 = &base_addr;
4214 LOOP: vp1 = phi(vp0,vp2)
4216 inner: vp3 = phi(vp1,vp4)
4217 vp4 = vp3 + inner_step
4218 if () goto inner
4220 vp2 = vp1 + step
4221 if () goto LOOP */
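/* A concrete instance of the above (illustrative SSA names), for a
V8HI access with a 16-byte step in an inner-most loop:

vectp.0 = &base_addr;
LOOP: vectp.1 = phi (vectp.0, vectp.2)
...
vectp.2 = vectp.1 + 16
goto LOOP */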
4223 /* (2) Calculate the initial address of the aggregate-pointer, and set
4224 the aggregate-pointer to point to it before the loop. */
4226 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4228 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4229 offset, loop, byte_offset);
4230 if (new_stmt_list)
4232 if (pe)
4234 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4235 gcc_assert (!new_bb);
4237 else
4238 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4241 *initial_address = new_temp;
4242 aggr_ptr_init = new_temp;
4244 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4245 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4246 inner-loop nested in LOOP (during outer-loop vectorization). */
4248 /* No update in loop is required. */
4249 if (only_init && (!loop_vinfo || at_loop == loop))
4250 aptr = aggr_ptr_init;
4251 else
4253 /* The step of the aggregate pointer is the type size. */
4254 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4255 /* One exception to the above is when the scalar step of the load in
4256 LOOP is zero. In this case the step here is also zero. */
4257 if (*inv_p)
4258 iv_step = size_zero_node;
4259 else if (tree_int_cst_sgn (step) == -1)
4260 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4262 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4264 create_iv (aggr_ptr_init,
4265 fold_convert (aggr_ptr_type, iv_step),
4266 aggr_ptr, loop, &incr_gsi, insert_after,
4267 &indx_before_incr, &indx_after_incr);
4268 incr = gsi_stmt (incr_gsi);
4269 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4271 /* Copy the points-to information if it exists. */
4272 if (DR_PTR_INFO (dr))
4274 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4275 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4277 if (ptr_incr)
4278 *ptr_incr = incr;
4280 aptr = indx_before_incr;
4283 if (!nested_in_vect_loop || only_init)
4284 return aptr;
4287 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4288 nested in LOOP, if it exists. */
4290 gcc_assert (nested_in_vect_loop);
4291 if (!only_init)
4293 standard_iv_increment_position (containing_loop, &incr_gsi,
4294 &insert_after);
4295 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4296 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4297 &indx_after_incr);
4298 incr = gsi_stmt (incr_gsi);
4299 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4301 /* Copy the points-to information if it exists. */
4302 if (DR_PTR_INFO (dr))
4304 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4305 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4307 if (ptr_incr)
4308 *ptr_incr = incr;
4310 return indx_before_incr;
4312 else
4313 gcc_unreachable ();
4317 /* Function bump_vector_ptr
4319 Increment a pointer (to a vector type) by vector-size. If requested,
4320 i.e. if PTR-INCR is given, then also connect the new increment stmt
4321 to the existing def-use update-chain of the pointer, by modifying
4322 the PTR_INCR as illustrated below:
4324 The pointer def-use update-chain before this function:
4325 DATAREF_PTR = phi (p_0, p_2)
4326 ....
4327 PTR_INCR: p_2 = DATAREF_PTR + step
4329 The pointer def-use update-chain after this function:
4330 DATAREF_PTR = phi (p_0, p_2)
4331 ....
4332 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4333 ....
4334 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4336 Input:
4337 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4338 in the loop.
4339 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4340 the loop. The increment amount across iterations is expected
4341 to be vector_size.
4342 BSI - location where the new update stmt is to be placed.
4343 STMT - the original scalar memory-access stmt that is being vectorized.
4344 BUMP - optional. The offset by which to bump the pointer. If not given,
4345 the offset is assumed to be vector_size.
4347 Output: Return NEW_DATAREF_PTR as illustrated above.
4351 tree
4352 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4353 gimple *stmt, tree bump)
4355 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4356 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4357 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4358 tree update = TYPE_SIZE_UNIT (vectype);
4359 gassign *incr_stmt;
4360 ssa_op_iter iter;
4361 use_operand_p use_p;
4362 tree new_dataref_ptr;
4364 if (bump)
4365 update = bump;
4367 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4368 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4369 else
4370 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4371 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4372 dataref_ptr, update);
4373 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4375 /* Copy the points-to information if it exists. */
4376 if (DR_PTR_INFO (dr))
4378 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4379 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4382 if (!ptr_incr)
4383 return new_dataref_ptr;
4385 /* Update the vector-pointer's cross-iteration increment. */
4386 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4388 tree use = USE_FROM_PTR (use_p);
4390 if (use == dataref_ptr)
4391 SET_USE (use_p, new_dataref_ptr);
4392 else
4393 gcc_assert (tree_int_cst_compare (use, update) == 0);
4396 return new_dataref_ptr;
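/* For instance (illustrative), bumping a V4SF dataref pointer with no
explicit BUMP uses TYPE_SIZE_UNIT of the vector type, i.e. it emits
NEW_DATAREF_PTR = DATAREF_PTR + 16 and rewrites the DATAREF_PTR operand
of PTR_INCR to NEW_DATAREF_PTR, as illustrated above. */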
4400 /* Function vect_create_destination_var.
4402 Create a new temporary of type VECTYPE. */
4404 tree
4405 vect_create_destination_var (tree scalar_dest, tree vectype)
4407 tree vec_dest;
4408 const char *name;
4409 char *new_name;
4410 tree type;
4411 enum vect_var_kind kind;
4413 kind = vectype
4414 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4415 ? vect_mask_var
4416 : vect_simple_var
4417 : vect_scalar_var;
4418 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4420 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4422 name = get_name (scalar_dest);
4423 if (name)
4424 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4425 else
4426 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4427 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4428 free (new_name);
4430 return vec_dest;
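/* E.g. (hypothetical SSA name), for a scalar destination sum_7 and a
V4SI VECTYPE this requests a vect_simple_var named "vect_sum_7"; with a
NULL VECTYPE the prefix is "stmp" instead. */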
4433 /* Function vect_grouped_store_supported.
4435 Returns TRUE if interleave high and interleave low permutations
4436 are supported, and FALSE otherwise. */
4438 bool
4439 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4441 machine_mode mode = TYPE_MODE (vectype);
4443 /* vect_permute_store_chain requires the group size to be equal to 3 or
4444 be a power of two. */
4445 if (count != 3 && exact_log2 (count) == -1)
4447 if (dump_enabled_p ())
4448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4449 "the size of the group of accesses"
4450 " is not a power of 2 or not eqaul to 3\n");
4451 return false;
4454 /* Check that the permutation is supported. */
4455 if (VECTOR_MODE_P (mode))
4457 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4458 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4460 if (count == 3)
4462 unsigned int j0 = 0, j1 = 0, j2 = 0;
4463 unsigned int i, j;
4465 for (j = 0; j < 3; j++)
4467 int nelt0 = ((3 - j) * nelt) % 3;
4468 int nelt1 = ((3 - j) * nelt + 1) % 3;
4469 int nelt2 = ((3 - j) * nelt + 2) % 3;
4470 for (i = 0; i < nelt; i++)
4472 if (3 * i + nelt0 < nelt)
4473 sel[3 * i + nelt0] = j0++;
4474 if (3 * i + nelt1 < nelt)
4475 sel[3 * i + nelt1] = nelt + j1++;
4476 if (3 * i + nelt2 < nelt)
4477 sel[3 * i + nelt2] = 0;
4479 if (!can_vec_perm_p (mode, false, sel))
4481 if (dump_enabled_p ())
4482 dump_printf (MSG_MISSED_OPTIMIZATION,
4483 "permutaion op not supported by target.\n");
4484 return false;
4487 for (i = 0; i < nelt; i++)
4489 if (3 * i + nelt0 < nelt)
4490 sel[3 * i + nelt0] = 3 * i + nelt0;
4491 if (3 * i + nelt1 < nelt)
4492 sel[3 * i + nelt1] = 3 * i + nelt1;
4493 if (3 * i + nelt2 < nelt)
4494 sel[3 * i + nelt2] = nelt + j2++;
4496 if (!can_vec_perm_p (mode, false, sel))
4498 if (dump_enabled_p ())
4499 dump_printf (MSG_MISSED_OPTIMIZATION,
4500 "permutaion op not supported by target.\n");
4501 return false;
4504 return true;
4506 else
4508 /* If length is not equal to 3 then only power of 2 is supported. */
4509 gcc_assert (pow2p_hwi (count));
4511 for (i = 0; i < nelt / 2; i++)
4513 sel[i * 2] = i;
4514 sel[i * 2 + 1] = i + nelt;
4516 if (can_vec_perm_p (mode, false, sel))
4518 for (i = 0; i < nelt; i++)
4519 sel[i] += nelt / 2;
4520 if (can_vec_perm_p (mode, false, sel))
4521 return true;
4526 if (dump_enabled_p ())
4527 dump_printf (MSG_MISSED_OPTIMIZATION,
4528 "permutaion op not supported by target.\n");
4529 return false;
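/* For the power-of-2 case above, with a 4-element vector mode the two
masks checked are {0, 4, 1, 5} (interleave high) and {2, 6, 3, 7}
(interleave low), indexing into the concatenation of the two input
vectors. */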
4533 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4534 type VECTYPE. */
4536 bool
4537 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4539 return vect_lanes_optab_supported_p ("vec_store_lanes",
4540 vec_store_lanes_optab,
4541 vectype, count);
4545 /* Function vect_permute_store_chain.
4547 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4548 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4549 the data correctly for the stores. Return the final references for stores
4550 in RESULT_CHAIN.
4552 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4553 The input is 4 vectors each containing 8 elements. We assign a number to
4554 each element, the input sequence is:
4556 1st vec: 0 1 2 3 4 5 6 7
4557 2nd vec: 8 9 10 11 12 13 14 15
4558 3rd vec: 16 17 18 19 20 21 22 23
4559 4th vec: 24 25 26 27 28 29 30 31
4561 The output sequence should be:
4563 1st vec: 0 8 16 24 1 9 17 25
4564 2nd vec: 2 10 18 26 3 11 19 27
4565 3rd vec: 4 12 20 28 5 13 21 29
4566 4th vec: 6 14 22 30 7 15 23 31
4568 i.e., we interleave the contents of the four vectors in their order.
4570 We use interleave_high/low instructions to create such output. The input of
4571 each interleave_high/low operation is two vectors:
4572 1st vec 2nd vec
4573 0 1 2 3 4 5 6 7
4574 the even elements of the result vector are obtained left-to-right from the
4575 high/low elements of the first vector. The odd elements of the result are
4576 obtained left-to-right from the high/low elements of the second vector.
4577 The output of interleave_high will be: 0 4 1 5
4578 and of interleave_low: 2 6 3 7
4581 The permutation is done in log LENGTH stages. In each stage interleave_high
4582 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4583 where the first argument is taken from the first half of DR_CHAIN and the
4584 second argument from its second half.
4585 In our example,
4587 I1: interleave_high (1st vec, 3rd vec)
4588 I2: interleave_low (1st vec, 3rd vec)
4589 I3: interleave_high (2nd vec, 4th vec)
4590 I4: interleave_low (2nd vec, 4th vec)
4592 The output for the first stage is:
4594 I1: 0 16 1 17 2 18 3 19
4595 I2: 4 20 5 21 6 22 7 23
4596 I3: 8 24 9 25 10 26 11 27
4597 I4: 12 28 13 29 14 30 15 31
4599 The output of the second stage, i.e. the final result is:
4601 I1: 0 8 16 24 1 9 17 25
4602 I2: 2 10 18 26 3 11 19 27
4603 I3: 4 12 20 28 5 13 21 29
4604 I4: 6 14 22 30 7 15 23 31. */
4606 void
4607 vect_permute_store_chain (vec<tree> dr_chain,
4608 unsigned int length,
4609 gimple *stmt,
4610 gimple_stmt_iterator *gsi,
4611 vec<tree> *result_chain)
4613 tree vect1, vect2, high, low;
4614 gimple *perm_stmt;
4615 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4616 tree perm_mask_low, perm_mask_high;
4617 tree data_ref;
4618 tree perm3_mask_low, perm3_mask_high;
4619 unsigned int i, n, log_length = exact_log2 (length);
4620 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4621 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4623 result_chain->quick_grow (length);
4624 memcpy (result_chain->address (), dr_chain.address (),
4625 length * sizeof (tree));
4627 if (length == 3)
4629 unsigned int j0 = 0, j1 = 0, j2 = 0;
4631 for (j = 0; j < 3; j++)
4633 int nelt0 = ((3 - j) * nelt) % 3;
4634 int nelt1 = ((3 - j) * nelt + 1) % 3;
4635 int nelt2 = ((3 - j) * nelt + 2) % 3;
4637 for (i = 0; i < nelt; i++)
4639 if (3 * i + nelt0 < nelt)
4640 sel[3 * i + nelt0] = j0++;
4641 if (3 * i + nelt1 < nelt)
4642 sel[3 * i + nelt1] = nelt + j1++;
4643 if (3 * i + nelt2 < nelt)
4644 sel[3 * i + nelt2] = 0;
4646 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4648 for (i = 0; i < nelt; i++)
4650 if (3 * i + nelt0 < nelt)
4651 sel[3 * i + nelt0] = 3 * i + nelt0;
4652 if (3 * i + nelt1 < nelt)
4653 sel[3 * i + nelt1] = 3 * i + nelt1;
4654 if (3 * i + nelt2 < nelt)
4655 sel[3 * i + nelt2] = nelt + j2++;
4657 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4659 vect1 = dr_chain[0];
4660 vect2 = dr_chain[1];
4662 /* Create interleaving stmt:
4663 low = VEC_PERM_EXPR <vect1, vect2,
4664 {j, nelt, *, j + 1, nelt + j + 1, *,
4665 j + 2, nelt + j + 2, *, ...}> */
4666 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4667 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4668 vect2, perm3_mask_low);
4669 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4671 vect1 = data_ref;
4672 vect2 = dr_chain[2];
4673 /* Create interleaving stmt:
4674 low = VEC_PERM_EXPR <vect1, vect2,
4675 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4676 6, 7, nelt + j + 2, ...}> */
4677 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4678 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4679 vect2, perm3_mask_high);
4680 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4681 (*result_chain)[j] = data_ref;
4684 else
4686 /* If length is not equal to 3 then only power of 2 is supported. */
4687 gcc_assert (pow2p_hwi (length));
4689 for (i = 0, n = nelt / 2; i < n; i++)
4691 sel[i * 2] = i;
4692 sel[i * 2 + 1] = i + nelt;
4694 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4696 for (i = 0; i < nelt; i++)
4697 sel[i] += nelt / 2;
4698 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4700 for (i = 0, n = log_length; i < n; i++)
4702 for (j = 0; j < length/2; j++)
4704 vect1 = dr_chain[j];
4705 vect2 = dr_chain[j+length/2];
4707 /* Create interleaving stmt:
4708 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4709 ...}> */
4710 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4711 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4712 vect2, perm_mask_high);
4713 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4714 (*result_chain)[2*j] = high;
4716 /* Create interleaving stmt:
4717 low = VEC_PERM_EXPR <vect1, vect2,
4718 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4719 ...}> */
4720 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4721 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4722 vect2, perm_mask_low);
4723 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4724 (*result_chain)[2*j+1] = low;
4726 memcpy (dr_chain.address (), result_chain->address (),
4727 length * sizeof (tree));
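/* For LENGTH == 2 and a 4-element vector type the loop above performs a
single stage and emits (illustrative SSA names):

vect_inter_high = VEC_PERM_EXPR <vect1, vect2, {0, 4, 1, 5}>;
vect_inter_low  = VEC_PERM_EXPR <vect1, vect2, {2, 6, 3, 7}>;

since log2 (LENGTH) == 1. */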
4732 /* Function vect_setup_realignment
4734 This function is called when vectorizing an unaligned load using
4735 the dr_explicit_realign[_optimized] scheme.
4736 This function generates the following code at the loop prolog:
4738 p = initial_addr;
4739 x msq_init = *(floor(p)); # prolog load
4740 realignment_token = call target_builtin;
4741 loop:
4742 x msq = phi (msq_init, ---)
4744 The stmts marked with x are generated only for the case of
4745 dr_explicit_realign_optimized.
4747 The code above sets up a new (vector) pointer, pointing to the first
4748 location accessed by STMT, and a "floor-aligned" load using that pointer.
4749 It also generates code to compute the "realignment-token" (if the relevant
4750 target hook was defined), and creates a phi-node at the loop-header bb
4751 whose arguments are the result of the prolog-load (created by this
4752 function) and the result of a load that takes place in the loop (to be
4753 created by the caller to this function).
4755 For the case of dr_explicit_realign_optimized:
4756 The caller to this function uses the phi-result (msq) to create the
4757 realignment code inside the loop, and sets up the missing phi argument,
4758 as follows:
4759 loop:
4760 msq = phi (msq_init, lsq)
4761 lsq = *(floor(p')); # load in loop
4762 result = realign_load (msq, lsq, realignment_token);
4764 For the case of dr_explicit_realign:
4765 loop:
4766 msq = *(floor(p)); # load in loop
4767 p' = p + (VS-1);
4768 lsq = *(floor(p')); # load in loop
4769 result = realign_load (msq, lsq, realignment_token);
4771 Input:
4772 STMT - (scalar) load stmt to be vectorized. This load accesses
4773 a memory location that may be unaligned.
4774 BSI - place where new code is to be inserted.
4775 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4776 is used.
4778 Output:
4779 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4780 target hook, if defined.
4781 Return value - the result of the loop-header phi node. */
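/* Note that "floor(p)" above denotes P rounded down to a vector-size
boundary; in the code below it is computed (illustratively, for a
16-byte vector type) as p & -16, via a BIT_AND_EXPR with
-TYPE_ALIGN_UNIT (vectype). */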
4783 tree
4784 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4785 tree *realignment_token,
4786 enum dr_alignment_support alignment_support_scheme,
4787 tree init_addr,
4788 struct loop **at_loop)
4790 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4791 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4792 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4793 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4794 struct loop *loop = NULL;
4795 edge pe = NULL;
4796 tree scalar_dest = gimple_assign_lhs (stmt);
4797 tree vec_dest;
4798 gimple *inc;
4799 tree ptr;
4800 tree data_ref;
4801 basic_block new_bb;
4802 tree msq_init = NULL_TREE;
4803 tree new_temp;
4804 gphi *phi_stmt;
4805 tree msq = NULL_TREE;
4806 gimple_seq stmts = NULL;
4807 bool inv_p;
4808 bool compute_in_loop = false;
4809 bool nested_in_vect_loop = false;
4810 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4811 struct loop *loop_for_initial_load = NULL;
4813 if (loop_vinfo)
4815 loop = LOOP_VINFO_LOOP (loop_vinfo);
4816 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4819 gcc_assert (alignment_support_scheme == dr_explicit_realign
4820 || alignment_support_scheme == dr_explicit_realign_optimized);
4822 /* We need to generate three things:
4823 1. the misalignment computation
4824 2. the extra vector load (for the optimized realignment scheme).
4825 3. the phi node for the two vectors from which the realignment is
4826 done (for the optimized realignment scheme). */
4828 /* 1. Determine where to generate the misalignment computation.
4830 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4831 calculation will be generated by this function, outside the loop (in the
4832 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4833 caller, inside the loop.
4835 Background: If the misalignment remains fixed throughout the iterations of
4836 the loop, then both realignment schemes are applicable, and also the
4837 misalignment computation can be done outside LOOP. This is because we are
4838 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4839 are a multiple of VS (the Vector Size), and therefore the misalignment in
4840 different vectorized LOOP iterations is always the same.
4841 The problem arises only if the memory access is in an inner-loop nested
4842 inside LOOP, which is now being vectorized using outer-loop vectorization.
4843 This is the only case when the misalignment of the memory access may not
4844 remain fixed throughout the iterations of the inner-loop (as explained in
4845 detail in vect_supportable_dr_alignment). In this case, not only is the
4846 optimized realignment scheme not applicable, but also the misalignment
4847 computation (and generation of the realignment token that is passed to
4848 REALIGN_LOAD) have to be done inside the loop.
4850 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4851 or not, which in turn determines if the misalignment is computed inside
4852 the inner-loop, or outside LOOP. */
4854 if (init_addr != NULL_TREE || !loop_vinfo)
4856 compute_in_loop = true;
4857 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4861 /* 2. Determine where to generate the extra vector load.
4863 For the optimized realignment scheme, instead of generating two vector
4864 loads in each iteration, we generate a single extra vector load in the
4865 preheader of the loop, and in each iteration reuse the result of the
4866 vector load from the previous iteration. In case the memory access is in
4867 an inner-loop nested inside LOOP, which is now being vectorized using
4868 outer-loop vectorization, we need to determine whether this initial vector
4869 load should be generated at the preheader of the inner-loop, or can be
4870 generated at the preheader of LOOP. If the memory access has no evolution
4871 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4872 to be generated inside LOOP (in the preheader of the inner-loop). */
4874 if (nested_in_vect_loop)
4876 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4877 bool invariant_in_outerloop =
4878 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4879 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4881 else
4882 loop_for_initial_load = loop;
4883 if (at_loop)
4884 *at_loop = loop_for_initial_load;
4886 if (loop_for_initial_load)
4887 pe = loop_preheader_edge (loop_for_initial_load);
4889 /* 3. For the case of the optimized realignment, create the first vector
4890 load at the loop preheader. */
4892 if (alignment_support_scheme == dr_explicit_realign_optimized)
4894 /* Create msq_init = *(floor(p1)) in the loop preheader */
4895 gassign *new_stmt;
4897 gcc_assert (!compute_in_loop);
4898 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4899 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4900 NULL_TREE, &init_addr, NULL, &inc,
4901 true, &inv_p);
4902 if (TREE_CODE (ptr) == SSA_NAME)
4903 new_temp = copy_ssa_name (ptr);
4904 else
4905 new_temp = make_ssa_name (TREE_TYPE (ptr));
4906 new_stmt = gimple_build_assign
4907 (new_temp, BIT_AND_EXPR, ptr,
4908 build_int_cst (TREE_TYPE (ptr),
4909 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4910 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4911 gcc_assert (!new_bb);
4912 data_ref
4913 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4914 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4915 new_stmt = gimple_build_assign (vec_dest, data_ref);
4916 new_temp = make_ssa_name (vec_dest, new_stmt);
4917 gimple_assign_set_lhs (new_stmt, new_temp);
4918 if (pe)
4920 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4921 gcc_assert (!new_bb);
4923 else
4924 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4926 msq_init = gimple_assign_lhs (new_stmt);
4929 /* 4. Create realignment token using a target builtin, if available.
4930 It is done either inside the containing loop, or before LOOP (as
4931 determined above). */
4933 if (targetm.vectorize.builtin_mask_for_load)
4935 gcall *new_stmt;
4936 tree builtin_decl;
4938 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4939 if (!init_addr)
4941 /* Generate the INIT_ADDR computation outside LOOP. */
4942 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4943 NULL_TREE, loop);
4944 if (loop)
4946 pe = loop_preheader_edge (loop);
4947 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4948 gcc_assert (!new_bb);
4950 else
4951 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4954 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4955 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4956 vec_dest =
4957 vect_create_destination_var (scalar_dest,
4958 gimple_call_return_type (new_stmt));
4959 new_temp = make_ssa_name (vec_dest, new_stmt);
4960 gimple_call_set_lhs (new_stmt, new_temp);
4962 if (compute_in_loop)
4963 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4964 else
4966 /* Generate the misalignment computation outside LOOP. */
4967 pe = loop_preheader_edge (loop);
4968 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4969 gcc_assert (!new_bb);
4972 *realignment_token = gimple_call_lhs (new_stmt);
4974 /* The result of the CALL_EXPR to this builtin is determined from
4975 the value of the parameter and no global variables are touched
4976 which makes the builtin a "const" function. Requiring the
4977 builtin to have the "const" attribute makes it unnecessary
4978 to call mark_call_clobbered. */
4979 gcc_assert (TREE_READONLY (builtin_decl));
4982 if (alignment_support_scheme == dr_explicit_realign)
4983 return msq;
4985 gcc_assert (!compute_in_loop);
4986 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4989 /* 5. Create msq = phi <msq_init, lsq> in loop */
4991 pe = loop_preheader_edge (containing_loop);
4992 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4993 msq = make_ssa_name (vec_dest);
4994 phi_stmt = create_phi_node (msq, containing_loop->header);
4995 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4997 return msq;
5001 /* Function vect_grouped_load_supported.
5003 COUNT is the size of the load group (the number of statements plus the
5004 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5005 only one statement, with a gap of COUNT - 1.
5007 Returns true if a suitable permute exists. */
5009 bool
5010 vect_grouped_load_supported (tree vectype, bool single_element_p,
5011 unsigned HOST_WIDE_INT count)
5013 machine_mode mode = TYPE_MODE (vectype);
5015 /* If this is single-element interleaving with an element distance
5016 that leaves unused vector loads around punt - we at least create
5017 very sub-optimal code in that case (and blow up memory,
5018 see PR65518). */
5019 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5021 if (dump_enabled_p ())
5022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5023 "single-element interleaving not supported "
5024 "for not adjacent vector loads\n");
5025 return false;
5028 /* vect_permute_load_chain requires the group size to be equal to 3 or
5029 be a power of two. */
5030 if (count != 3 && exact_log2 (count) == -1)
5032 if (dump_enabled_p ())
5033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5034 "the size of the group of accesses"
5035 " is not a power of 2 or not equal to 3\n");
5036 return false;
5039 /* Check that the permutation is supported. */
5040 if (VECTOR_MODE_P (mode))
5042 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5043 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5045 if (count == 3)
5047 unsigned int k;
5048 for (k = 0; k < 3; k++)
5050 for (i = 0; i < nelt; i++)
5051 if (3 * i + k < 2 * nelt)
5052 sel[i] = 3 * i + k;
5053 else
5054 sel[i] = 0;
5055 if (!can_vec_perm_p (mode, false, sel))
5057 if (dump_enabled_p ())
5058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5059 "shuffle of 3 loads is not supported by"
5060 " target\n");
5061 return false;
5063 for (i = 0, j = 0; i < nelt; i++)
5064 if (3 * i + k < 2 * nelt)
5065 sel[i] = i;
5066 else
5067 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5068 if (!can_vec_perm_p (mode, false, sel))
5070 if (dump_enabled_p ())
5071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5072 "shuffle of 3 loads is not supported by"
5073 " target\n");
5074 return false;
5077 return true;
5079 else
5081 /* If length is not equal to 3 then only power of 2 is supported. */
5082 gcc_assert (pow2p_hwi (count));
5083 for (i = 0; i < nelt; i++)
5084 sel[i] = i * 2;
5085 if (can_vec_perm_p (mode, false, sel))
5087 for (i = 0; i < nelt; i++)
5088 sel[i] = i * 2 + 1;
5089 if (can_vec_perm_p (mode, false, sel))
5090 return true;
5095 if (dump_enabled_p ())
5096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5097 "extract even/odd not supported by target\n");
5098 return false;
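/* In the power-of-2 case above, with a 4-element vector mode the masks
checked are {0, 2, 4, 6} (extract even) and {1, 3, 5, 7} (extract odd),
again indexing into the concatenation of the two input vectors. */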
5101 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5102 type VECTYPE. */
5104 bool
5105 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5107 return vect_lanes_optab_supported_p ("vec_load_lanes",
5108 vec_load_lanes_optab,
5109 vectype, count);
5112 /* Function vect_permute_load_chain.
5114 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5115 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5116 the input data correctly. Return the final references for loads in
5117 RESULT_CHAIN.
5119 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5120 The input is 4 vectors each containing 8 elements. We assign a number to each
5121 element, the input sequence is:
5123 1st vec: 0 1 2 3 4 5 6 7
5124 2nd vec: 8 9 10 11 12 13 14 15
5125 3rd vec: 16 17 18 19 20 21 22 23
5126 4th vec: 24 25 26 27 28 29 30 31
5128 The output sequence should be:
5130 1st vec: 0 4 8 12 16 20 24 28
5131 2nd vec: 1 5 9 13 17 21 25 29
5132 3rd vec: 2 6 10 14 18 22 26 30
5133 4th vec: 3 7 11 15 19 23 27 31
5135 i.e., the first output vector should contain the first elements of each
5136 interleaving group, etc.
5138 We use extract_even/odd instructions to create such output. The input of
5139 each extract_even/odd operation is two vectors
5140 1st vec 2nd vec
5141 0 1 2 3 4 5 6 7
5143 and the output is the vector of extracted even/odd elements. The output of
5144 extract_even will be: 0 2 4 6
5145 and of extract_odd: 1 3 5 7
5148 The permutation is done in log LENGTH stages. In each stage extract_even
5149 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5150 their order. In our example,
5152 E1: extract_even (1st vec, 2nd vec)
5153 E2: extract_odd (1st vec, 2nd vec)
5154 E3: extract_even (3rd vec, 4th vec)
5155 E4: extract_odd (3rd vec, 4th vec)
5157 The output for the first stage will be:
5159 E1: 0 2 4 6 8 10 12 14
5160 E2: 1 3 5 7 9 11 13 15
5161 E3: 16 18 20 22 24 26 28 30
5162 E4: 17 19 21 23 25 27 29 31
5164 In order to proceed and create the correct sequence for the next stage (or
5165 for the correct output, if the second stage is the last one, as in our
5166 example), we first put the output of extract_even operation and then the
5167 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5168 The input for the second stage is:
5170 1st vec (E1): 0 2 4 6 8 10 12 14
5171 2nd vec (E3): 16 18 20 22 24 26 28 30
5172 3rd vec (E2): 1 3 5 7 9 11 13 15
5173 4th vec (E4): 17 19 21 23 25 27 29 31
5175 The output of the second stage:
5177 E1: 0 4 8 12 16 20 24 28
5178 E2: 2 6 10 14 18 22 26 30
5179 E3: 1 5 9 13 17 21 25 29
5180 E4: 3 7 11 15 19 23 27 31
5182 And RESULT_CHAIN after reordering:
5184 1st vec (E1): 0 4 8 12 16 20 24 28
5185 2nd vec (E3): 1 5 9 13 17 21 25 29
5186 3rd vec (E2): 2 6 10 14 18 22 26 30
5187 4th vec (E4): 3 7 11 15 19 23 27 31. */
5189 static void
5190 vect_permute_load_chain (vec<tree> dr_chain,
5191 unsigned int length,
5192 gimple *stmt,
5193 gimple_stmt_iterator *gsi,
5194 vec<tree> *result_chain)
5196 tree data_ref, first_vect, second_vect;
5197 tree perm_mask_even, perm_mask_odd;
5198 tree perm3_mask_low, perm3_mask_high;
5199 gimple *perm_stmt;
5200 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5201 unsigned int i, j, log_length = exact_log2 (length);
5202 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5203 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5205 result_chain->quick_grow (length);
5206 memcpy (result_chain->address (), dr_chain.address (),
5207 length * sizeof (tree));
5209 if (length == 3)
5211 unsigned int k;
5213 for (k = 0; k < 3; k++)
5215 for (i = 0; i < nelt; i++)
5216 if (3 * i + k < 2 * nelt)
5217 sel[i] = 3 * i + k;
5218 else
5219 sel[i] = 0;
5220 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5222 for (i = 0, j = 0; i < nelt; i++)
5223 if (3 * i + k < 2 * nelt)
5224 sel[i] = i;
5225 else
5226 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5228 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5230 first_vect = dr_chain[0];
5231 second_vect = dr_chain[1];
5233 /* Create interleaving stmt (low part of):
5234 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5235 ...}> */
5236 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5237 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5238 second_vect, perm3_mask_low);
5239 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5241 /* Create interleaving stmt (high part of):
5242 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5243 ...}> */
5244 first_vect = data_ref;
5245 second_vect = dr_chain[2];
5246 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5247 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5248 second_vect, perm3_mask_high);
5249 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5250 (*result_chain)[k] = data_ref;
5253 else
5255 /* If length is not equal to 3 then only power of 2 is supported. */
5256 gcc_assert (pow2p_hwi (length));
5258 for (i = 0; i < nelt; ++i)
5259 sel[i] = i * 2;
5260 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5262 for (i = 0; i < nelt; ++i)
5263 sel[i] = i * 2 + 1;
5264 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5266 for (i = 0; i < log_length; i++)
5268 for (j = 0; j < length; j += 2)
5270 first_vect = dr_chain[j];
5271 second_vect = dr_chain[j+1];
5273 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5274 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5275 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5276 first_vect, second_vect,
5277 perm_mask_even);
5278 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5279 (*result_chain)[j/2] = data_ref;
5281 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5282 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5283 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5284 first_vect, second_vect,
5285 perm_mask_odd);
5286 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5287 (*result_chain)[j/2+length/2] = data_ref;
5289 memcpy (dr_chain.address (), result_chain->address (),
5290 length * sizeof (tree));
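/* For LENGTH == 2 and a 4-element vector type the single stage above
emits (illustrative SSA names):

vect_perm_even = VEC_PERM_EXPR <first_vect, second_vect, {0, 2, 4, 6}>;
vect_perm_odd  = VEC_PERM_EXPR <first_vect, second_vect, {1, 3, 5, 7}>;

storing the even result in RESULT_CHAIN[0] and the odd result in
RESULT_CHAIN[1]. */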
5295 /* Function vect_shift_permute_load_chain.
5297 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5298 sequence of stmts to reorder the input data accordingly.
5299 Return the final references for loads in RESULT_CHAIN.
5300 Return true if successful, false otherwise.
5302 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5303 The input is 3 vectors each containing 8 elements. We assign a
5304 number to each element, the input sequence is:
5306 1st vec: 0 1 2 3 4 5 6 7
5307 2nd vec: 8 9 10 11 12 13 14 15
5308 3rd vec: 16 17 18 19 20 21 22 23
5310 The output sequence should be:
5312 1st vec: 0 3 6 9 12 15 18 21
5313 2nd vec: 1 4 7 10 13 16 19 22
5314 3rd vec: 2 5 8 11 14 17 20 23
5316 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5318 First we shuffle all 3 vectors to get correct elements order:
5320 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5321 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5322 3rd vec: (16 19 22) (17 20 23) (18 21)
5324 Next we unite and shift vector 3 times:
5326 1st step:
5327 shift right by 6 the concatenation of:
5328 "1st vec" and "2nd vec"
5329 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5330 "2nd vec" and "3rd vec"
5331 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5332 "3rd vec" and "1st vec"
5333 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5334 | New vectors |
5336 So that now new vectors are:
5338 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5339 2nd vec: (10 13) (16 19 22) (17 20 23)
5340 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5342 2nd step:
5343 shift right by 5 the concatenation of:
5344 "1st vec" and "3rd vec"
5345 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5346 "2nd vec" and "1st vec"
5347 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5348 "3rd vec" and "2nd vec"
5349 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5350 | New vectors |
5352 So that now new vectors are:
5354 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5355 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5356 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5358 3rd step:
5359 shift right by 5 the concatenation of:
5360 "1st vec" and "1st vec"
5361 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5362 shift right by 3 the concatenation of:
5363 "2nd vec" and "2nd vec"
5364 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5365 | New vectors |
5367 So that now all vectors are READY:
5368 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5369 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5370 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5372 This algorithm is faster than the one in vect_permute_load_chain if:
5373 1. "shift of a concatenation" is faster than general permutation.
5374 This is usually so.
5375 2. The TARGET machine can't execute vector instructions in parallel.
5376 This is because each step of the algorithm depends on the previous one.
5377 The algorithm in vect_permute_load_chain is much more parallel.
5379 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5382 static bool
5383 vect_shift_permute_load_chain (vec<tree> dr_chain,
5384 unsigned int length,
5385 gimple *stmt,
5386 gimple_stmt_iterator *gsi,
5387 vec<tree> *result_chain)
5389 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5390 tree perm2_mask1, perm2_mask2, perm3_mask;
5391 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5392 gimple *perm_stmt;
5394 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5395 unsigned int i;
5396 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5397 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5398 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5399 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5401 result_chain->quick_grow (length);
5402 memcpy (result_chain->address (), dr_chain.address (),
5403 length * sizeof (tree));
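  /* RESULT_CHAIN starts out as a copy of DR_CHAIN; if neither of the two
     special cases below applies, the function returns false and the caller
     (vect_transform_grouped_load) falls back to vect_permute_load_chain.  */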
5405 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5407 unsigned int j, log_length = exact_log2 (length);
5408 for (i = 0; i < nelt / 2; ++i)
5409 sel[i] = i * 2;
5410 for (i = 0; i < nelt / 2; ++i)
5411 sel[nelt / 2 + i] = i * 2 + 1;
5412 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5414 if (dump_enabled_p ())
5415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5416 "shuffle of 2 fields structure is not \
5417 supported by target\n");
5418 return false;
5420 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5422 for (i = 0; i < nelt / 2; ++i)
5423 sel[i] = i * 2 + 1;
5424 for (i = 0; i < nelt / 2; ++i)
5425 sel[nelt / 2 + i] = i * 2;
5426 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5428 if (dump_enabled_p ())
5429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5430 "shuffle of 2 fields structure is not \
5431 supported by target\n");
5432 return false;
5434 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5436 /* Generating permutation constant to shift all elements.
5437 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5438 for (i = 0; i < nelt; i++)
5439 sel[i] = nelt / 2 + i;
5440 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5442 if (dump_enabled_p ())
5443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5444 "shift permutation is not supported by target\n");
5445 return false;
5447 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5449 /* Generating permutation constant to select vector from 2.
5450 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5451 for (i = 0; i < nelt / 2; i++)
5452 sel[i] = i;
5453 for (i = nelt / 2; i < nelt; i++)
5454 sel[i] = nelt + i;
5455 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5457 if (dump_enabled_p ())
5458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5459 "select is not supported by target\n");
5460 return false;
5462 select_mask = vect_gen_perm_mask_checked (vectype, sel);
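      /* With these masks, each iteration of the loop below splits every pair
         of adjacent vectors in DR_CHAIN: SELECT_MASK gathers the even
         elements of the pair into the first half of RESULT_CHAIN, while
         SHIFT1_MASK gathers the odd elements into the second half.  After
         LOG_LENGTH such passes the power-of-two interleaving is undone.  */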
5464 for (i = 0; i < log_length; i++)
5466 for (j = 0; j < length; j += 2)
5468 first_vect = dr_chain[j];
5469 second_vect = dr_chain[j + 1];
5471 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5472 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5473 first_vect, first_vect,
5474 perm2_mask1);
5475 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5476 vect[0] = data_ref;
5478 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5479 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5480 second_vect, second_vect,
5481 perm2_mask2);
5482 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5483 vect[1] = data_ref;
5485 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5486 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5487 vect[0], vect[1], shift1_mask);
5488 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5489 (*result_chain)[j/2 + length/2] = data_ref;
5491 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5492 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5493 vect[0], vect[1], select_mask);
5494 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5495 (*result_chain)[j/2] = data_ref;
5497 memcpy (dr_chain.address (), result_chain->address (),
5498 length * sizeof (tree));
5500 return true;
5502 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5504 unsigned int k = 0, l = 0;
5506 /* Generating permutation constant to get all elements in the right order.
5507 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5508 for (i = 0; i < nelt; i++)
5510 if (3 * k + (l % 3) >= nelt)
5512 k = 0;
5513 l += (3 - (nelt % 3));
5515 sel[i] = 3 * k + (l % 3);
5516 k++;
5518 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5520 if (dump_enabled_p ())
5521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5522 "shuffle of 3 fields structure is not \
5523 supported by target\n");
5524 return false;
5526 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5528 /* Generating permutation constant to shift all elements.
5529 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5530 for (i = 0; i < nelt; i++)
5531 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5532 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5534 if (dump_enabled_p ())
5535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5536 "shift permutation is not supported by target\n");
5537 return false;
5539 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5541 /* Generating permutation constant to shift all elements.
5542 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5543 for (i = 0; i < nelt; i++)
5544 sel[i] = 2 * (nelt / 3) + 1 + i;
5545 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5547 if (dump_enabled_p ())
5548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5549 "shift permutation is not supported by target\n");
5550 return false;
5552 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5554 /* Generating permutation constant to shift all elements.
5555 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5556 for (i = 0; i < nelt; i++)
5557 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5558 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5560 if (dump_enabled_p ())
5561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5562 "shift permutation is not supported by target\n");
5563 return false;
5565 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5567 /* Generating permutation constant to shift all elements.
5568 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5569 for (i = 0; i < nelt; i++)
5570 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5571 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5573 if (dump_enabled_p ())
5574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5575 "shift permutation is not supported by target\n");
5576 return false;
5578 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
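      /* The loops below implement the steps from the function comment:
         PERM3_MASK first groups the elements of each field within every
         input vector, SHIFT1_MASK and then SHIFT2_MASK rotate the pieces
         across vectors, and finally SHIFT3_MASK and SHIFT4_MASK fix up the
         two vectors that are not yet in order (the third one is already
         READY after the second shift).  */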
5580 for (k = 0; k < 3; k++)
5582 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5583 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5584 dr_chain[k], dr_chain[k],
5585 perm3_mask);
5586 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5587 vect[k] = data_ref;
5590 for (k = 0; k < 3; k++)
5592 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5593 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5594 vect[k % 3], vect[(k + 1) % 3],
5595 shift1_mask);
5596 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5597 vect_shift[k] = data_ref;
5600 for (k = 0; k < 3; k++)
5602 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5603 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5604 vect_shift[(4 - k) % 3],
5605 vect_shift[(3 - k) % 3],
5606 shift2_mask);
5607 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5608 vect[k] = data_ref;
5611 (*result_chain)[3 - (nelt % 3)] = vect[2];
5613 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5614 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5615 vect[0], shift3_mask);
5616 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5617 (*result_chain)[nelt % 3] = data_ref;
5619 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5620 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5621 vect[1], shift4_mask);
5622 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5623 (*result_chain)[0] = data_ref;
5624 return true;
5626 return false;
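/* Illustrative, self-contained sketch; it is not part of GCC and the names
   below (print_sel, main) exist only here.  It re-derives the LENGTH == 3
   permutation masks for an 8-element vector (nelt == 8) with the same index
   formulas used in vect_shift_permute_load_chain above, so the mask values
   quoted in the comments can be reproduced by compiling this snippet on its
   own.  */

#include <stdio.h>

static void
print_sel (const char *name, const unsigned *sel, unsigned nelt)
{
  printf ("%s =", name);
  for (unsigned i = 0; i < nelt; i++)
    printf (" %u", sel[i]);
  printf ("\n");
}

int
main (void)
{
  const unsigned nelt = 8;
  unsigned sel[8];
  unsigned i, k = 0, l = 0;

  /* perm3_mask: gather the elements of each of the 3 fields together,
     giving {0 3 6 1 4 7 2 5} for nelt == 8.  */
  for (i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += (3 - (nelt % 3));
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  print_sel ("perm3_mask ", sel, nelt);

  /* shift1_mask: shift the concatenation right by 2*(nelt/3) + nelt%3.  */
  for (i = 0; i < nelt; i++)
    sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
  print_sel ("shift1_mask", sel, nelt);

  /* shift2_mask: shift right by 2*(nelt/3) + 1.  */
  for (i = 0; i < nelt; i++)
    sel[i] = 2 * (nelt / 3) + 1 + i;
  print_sel ("shift2_mask", sel, nelt);

  /* shift3_mask: shift right by nelt/3 + (nelt%3)/2.  */
  for (i = 0; i < nelt; i++)
    sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
  print_sel ("shift3_mask", sel, nelt);

  /* shift4_mask: shift right by 2*(nelt/3) + (nelt%3)/2.  */
  for (i = 0; i < nelt; i++)
    sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
  print_sel ("shift4_mask", sel, nelt);

  return 0;
}

/* Running the above prints perm3_mask = {0 3 6 1 4 7 2 5} and the shift masks
   {6..13}, {5..12}, {3..10} and {5..12}, matching the constants documented in
   the comments of vect_shift_permute_load_chain.  */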
5629 /* Function vect_transform_grouped_load.
5631 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5632 to perform their permutation and assign the resulting vectorized statements to
5633 the scalar statements.
5636 void
5637 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5638 gimple_stmt_iterator *gsi)
5640 machine_mode mode;
5641 vec<tree> result_chain = vNULL;
5643 /* DR_CHAIN contains the input data-refs that are part of the interleaving.
5644 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
5645 permuted vectors, which are ready for the vector computation. */
5646 result_chain.create (size);
5648 /* If the reassociation width for the vector type is 2 or greater, the target
5649 machine can execute 2 or more vector instructions in parallel. Otherwise
5650 try to get the chain for the load group using vect_shift_permute_load_chain. */
5651 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5652 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5653 || pow2p_hwi (size)
5654 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5655 gsi, &result_chain))
5656 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5657 vect_record_grouped_load_vectors (stmt, result_chain);
5658 result_chain.release ();
5661 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5662 generated as part of the vectorization of STMT. Assign the statement
5663 for each vector to the associated scalar statement. */
5665 void
5666 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5668 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5669 gimple *next_stmt, *new_stmt;
5670 unsigned int i, gap_count;
5671 tree tmp_data_ref;
5673 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5674 Since we scan the chain starting from its first node, their order
5675 corresponds to the order of data-refs in RESULT_CHAIN. */
5676 next_stmt = first_stmt;
5677 gap_count = 1;
5678 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5680 if (!next_stmt)
5681 break;
5683 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5684 code elimination pass later. No need to check for the first stmt in
5685 the group, since it always exists.
5686 GROUP_GAP is the number of steps in elements from the previous
5687 access (if there is no gap GROUP_GAP is 1). We skip loads that
5688 correspond to the gaps. */
5689 if (next_stmt != first_stmt
5690 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5692 gap_count++;
5693 continue;
5696 while (next_stmt)
5698 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5699 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5700 copies, and we put the new vector statement in the first available
5701 RELATED_STMT. */
5702 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5703 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5704 else
5706 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5708 gimple *prev_stmt =
5709 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5710 gimple *rel_stmt =
5711 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5712 while (rel_stmt)
5714 prev_stmt = rel_stmt;
5715 rel_stmt =
5716 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5719 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5720 new_stmt;
5724 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5725 gap_count = 1;
5726 /* If NEXT_STMT accesses the same DR as the previous statement,
5727 put the same TMP_DATA_REF as its vectorized statement; otherwise
5728 get the next data-ref from RESULT_CHAIN. */
5729 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5730 break;
5735 /* Function vect_can_force_dr_alignment_p.
5737 Returns whether the alignment of a DECL can be forced to be aligned
5738 on an ALIGNMENT-bit boundary. */
5740 bool
5741 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5743 if (!VAR_P (decl))
5744 return false;
5746 if (decl_in_symtab_p (decl)
5747 && !symtab_node::get (decl)->can_increase_alignment_p ())
5748 return false;
5750 if (TREE_STATIC (decl))
5751 return (alignment <= MAX_OFILE_ALIGNMENT);
5752 else
5753 return (alignment <= MAX_STACK_ALIGNMENT);
5757 /* Return whether the data reference DR is supported with respect to its
5758 alignment.
5759 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5760 if it is aligned, i.e., check if it is possible to vectorize it with a
5761 different alignment. */
5763 enum dr_alignment_support
5764 vect_supportable_dr_alignment (struct data_reference *dr,
5765 bool check_aligned_accesses)
5767 gimple *stmt = DR_STMT (dr);
5768 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5769 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5770 machine_mode mode = TYPE_MODE (vectype);
5771 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5772 struct loop *vect_loop = NULL;
5773 bool nested_in_vect_loop = false;
5775 if (aligned_access_p (dr) && !check_aligned_accesses)
5776 return dr_aligned;
5778 /* For now assume all conditional loads/stores support unaligned
5779 access without any special code. */
5780 if (is_gimple_call (stmt)
5781 && gimple_call_internal_p (stmt)
5782 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5783 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5784 return dr_unaligned_supported;
5786 if (loop_vinfo)
5788 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5789 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5792 /* Possibly unaligned access. */
5794 /* We can choose between using the implicit realignment scheme (generating
5795 a misaligned_move stmt) and the explicit realignment scheme (generating
5796 aligned loads with a REALIGN_LOAD). There are two variants to the
5797 explicit realignment scheme: optimized, and unoptimized.
5798 We can optimize the realignment only if the step between consecutive
5799 vector loads is equal to the vector size. Since the vector memory
5800 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5801 is guaranteed that the misalignment amount remains the same throughout the
5802 execution of the vectorized loop. Therefore, we can create the
5803 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5804 at the loop preheader.
5806 However, in the case of outer-loop vectorization, when vectorizing a
5807 memory access in the inner-loop nested within the LOOP that is now being
5808 vectorized, while it is guaranteed that the misalignment of the
5809 vectorized memory access will remain the same in different outer-loop
5810 iterations, it is *not* guaranteed that it will remain the same throughout
5811 the execution of the inner-loop. This is because the inner-loop advances
5812 with the original scalar step (and not in steps of VS). If the inner-loop
5813 step happens to be a multiple of VS, then the misalignment remains fixed
5814 and we can use the optimized realignment scheme. For example:
5816 for (i=0; i<N; i++)
5817 for (j=0; j<M; j++)
5818 s += a[i+j];
5820 When vectorizing the i-loop in the above example, the step between
5821 consecutive vector loads is 1, and so the misalignment does not remain
5822 fixed across the execution of the inner-loop, and the realignment cannot
5823 be optimized (as illustrated in the following pseudo vectorized loop):
5825 for (i=0; i<N; i+=4)
5826 for (j=0; j<M; j++){
5827 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5828 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5829 // (assuming that we start from an aligned address).
5832 We therefore have to use the unoptimized realignment scheme:
5834 for (i=0; i<N; i+=4)
5835 for (j=k; j<M; j+=4)
5836 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5837 // that the misalignment of the initial address is
5838 // 0).
5840 The loop can then be vectorized as follows:
5842 for (k=0; k<4; k++){
5843 rt = get_realignment_token (&vp[k]);
5844 for (i=0; i<N; i+=4){
5845 v1 = vp[i+k];
5846 for (j=k; j<M; j+=4){
5847 v2 = vp[i+j+VS-1];
5848 va = REALIGN_LOAD <v1,v2,rt>;
5849 vs += va;
5850 v1 = v2;
5853 } */
5855 if (DR_IS_READ (dr))
5857 bool is_packed = false;
5858 tree type = (TREE_TYPE (DR_REF (dr)));
5860 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5861 && (!targetm.vectorize.builtin_mask_for_load
5862 || targetm.vectorize.builtin_mask_for_load ()))
5864 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5866 /* If we are doing SLP then the accesses need not have the
5867 same alignment; instead it depends on the SLP group size. */
5868 if (loop_vinfo
5869 && STMT_SLP_TYPE (stmt_info)
5870 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5871 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5872 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5874 else if (!loop_vinfo
5875 || (nested_in_vect_loop
5876 && (TREE_INT_CST_LOW (DR_STEP (dr))
5877 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5878 return dr_explicit_realign;
5879 else
5880 return dr_explicit_realign_optimized;
5882 if (!known_alignment_for_access_p (dr))
5883 is_packed = not_size_aligned (DR_REF (dr));
5885 if (targetm.vectorize.support_vector_misalignment
5886 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5887 /* Can't software pipeline the loads, but can at least do them. */
5888 return dr_unaligned_supported;
5890 else
5892 bool is_packed = false;
5893 tree type = (TREE_TYPE (DR_REF (dr)));
5895 if (!known_alignment_for_access_p (dr))
5896 is_packed = not_size_aligned (DR_REF (dr));
5898 if (targetm.vectorize.support_vector_misalignment
5899 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5900 return dr_unaligned_supported;
5903 /* Unsupported. */
5904 return dr_unaligned_unsupported;
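/* A simplified, self-contained restatement of the decision tree implemented
   above.  This is only a sketch: the parameter names and the enum below are
   hypothetical (the vectorizer's real result type is dr_alignment_support),
   HAVE_REALIGN_LOAD stands for the vec_realign_load optab / mask-for-load
   checks, and the SLP group-size special case is omitted.  */

#include <stdbool.h>

enum dr_alignment_sketch
{
  SKETCH_ALIGNED,
  SKETCH_EXPLICIT_REALIGN,
  SKETCH_EXPLICIT_REALIGN_OPTIMIZED,
  SKETCH_UNALIGNED_SUPPORTED,
  SKETCH_UNALIGNED_UNSUPPORTED
};

static enum dr_alignment_sketch
classify_dr_alignment (bool aligned, bool check_aligned_accesses,
                       bool masked_load_or_store, bool is_read,
                       bool have_realign_load, bool in_loop,
                       bool step_preserves_misalignment,
                       bool target_handles_misalignment)
{
  if (aligned && !check_aligned_accesses)
    return SKETCH_ALIGNED;

  /* Masked (conditional) loads/stores are assumed to tolerate any
     misalignment without special code.  */
  if (masked_load_or_store)
    return SKETCH_UNALIGNED_SUPPORTED;

  /* Reads can use the explicit realignment scheme when the target has a
     realign-load pattern; it is "optimized" only when the misalignment is
     known to stay fixed across the vectorized loop.  */
  if (is_read && have_realign_load)
    return (in_loop && step_preserves_misalignment
            ? SKETCH_EXPLICIT_REALIGN_OPTIMIZED
            : SKETCH_EXPLICIT_REALIGN);

  if (target_handles_misalignment)
    return SKETCH_UNALIGNED_SUPPORTED;

  return SKETCH_UNALIGNED_UNSUPPORTED;
}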