1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
54 /* Return true if load- or store-lanes optab OPTAB is implemented for
55 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
57 static bool
58 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
59 tree vectype, unsigned HOST_WIDE_INT count)
61 machine_mode mode, array_mode;
62 bool limit_p;
64 mode = TYPE_MODE (vectype);
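/* Look for an integer mode wide enough to hold COUNT vectors.  Modes wider
   than MAX_FIXED_MODE_SIZE are only acceptable when the target explicitly
   supports an array mode of this size, in which case LIMIT_P is false.  */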
65 limit_p = !targetm.array_mode_supported_p (mode, count);
66 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
67 MODE_INT, limit_p);
69 if (array_mode == BLKmode)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
74 GET_MODE_NAME (mode), count);
75 return false;
78 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
80 if (dump_enabled_p ())
81 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
82 "cannot use %s<%s><%s>\n", name,
83 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
84 return false;
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_NOTE, vect_location,
89 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
90 GET_MODE_NAME (mode));
92 return true;
96 /* Return the smallest scalar part of STMT.
97 This is used to determine the vectype of the stmt. We generally set the
98 vectype according to the type of the result (lhs). For stmts whose
99 result-type is different than the type of the arguments (e.g., demotion,
100 promotion), vectype will be reset appropriately (later). Note that we have
101 to visit the smallest datatype in this function, because that determines the
102 VF. If the smallest datatype in the loop is present only as the rhs of a
103 promotion operation, we would miss it by looking only at the lhs.
104 Such a case, where a variable of this datatype does not appear in the lhs
105 anywhere in the loop, can only occur if it's an invariant: e.g.:
106 'int_x = (int) short_inv', which we'd expect to have been optimized away by
107 invariant motion. However, we cannot rely on invariant motion to always
108 take invariants out of the loop, and so in the case of promotion we also
109 have to check the rhs.
110 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
111 types. */
113 tree
114 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
115 HOST_WIDE_INT *rhs_size_unit)
117 tree scalar_type = gimple_expr_type (stmt);
118 HOST_WIDE_INT lhs, rhs;
120 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
122 if (is_gimple_assign (stmt)
123 && (gimple_assign_cast_p (stmt)
124 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
125 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
126 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
128 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
130 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
131 if (rhs < lhs)
132 scalar_type = rhs_type;
135 *lhs_size_unit = lhs;
136 *rhs_size_unit = rhs;
137 return scalar_type;
141 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
142 tested at run-time. Return TRUE if DDR was successfully inserted.
143 Return false if versioning is not supported. */
145 static bool
146 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
148 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
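  /* --param vect-max-version-for-alias-checks=0 disables run-time alias
     versioning altogether.  */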
150 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
151 return false;
153 if (dump_enabled_p ())
155 dump_printf_loc (MSG_NOTE, vect_location,
156 "mark for run-time aliasing test between ");
157 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
158 dump_printf (MSG_NOTE, " and ");
159 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
160 dump_printf (MSG_NOTE, "\n");
163 if (optimize_loop_nest_for_size_p (loop))
165 if (dump_enabled_p ())
166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
167 "versioning not supported when optimizing"
168 " for size.\n");
169 return false;
172 /* FORNOW: We don't support versioning with outer-loop vectorization. */
173 if (loop->inner)
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
177 "versioning not yet supported for outer-loops.\n");
178 return false;
181 /* FORNOW: We don't support creating runtime alias tests for non-constant
182 step. */
183 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
184 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
186 if (dump_enabled_p ())
187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
188 "versioning not yet supported for non-constant "
189 "step\n");
190 return false;
193 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
194 return true;
198 /* Function vect_analyze_data_ref_dependence.
200 Return TRUE if there (might) exist a dependence between a memory-reference
201 DRA and a memory-reference DRB that prevents vectorization. If the
202 dependence can instead be checked at run-time by versioning for alias,
203 return FALSE. Adjust *MAX_VF according to the data dependence. */
205 static bool
206 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
207 loop_vec_info loop_vinfo, int *max_vf)
209 unsigned int i;
210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
211 struct data_reference *dra = DDR_A (ddr);
212 struct data_reference *drb = DDR_B (ddr);
213 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
214 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
215 lambda_vector dist_v;
216 unsigned int loop_depth;
218 /* In loop analysis all data references should be vectorizable. */
219 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
220 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
221 gcc_unreachable ();
223 /* Independent data accesses. */
224 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
225 return false;
227 if (dra == drb
228 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
229 return false;
231 /* We do not have to consider dependences between accesses that belong
232 to the same group. */
233 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
234 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
235 return false;
237 /* Even if we have an anti-dependence then, as the vectorized loop covers at
238 least two scalar iterations, there is always also a true dependence.
239 As the vectorizer does not re-order loads and stores we can ignore
240 the anti-dependence if TBAA can disambiguate both DRs similar to the
241 case with known negative distance anti-dependences (positive
242 distance anti-dependences would violate TBAA constraints). */
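  /* E.g. a load through an "int *" and a store through a "float *" have
     non-conflicting alias sets, so the pair can be disregarded here.  */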
243 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
244 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
245 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
246 get_alias_set (DR_REF (drb))))
247 return false;
249 /* Unknown data dependence. */
250 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
252 /* If user asserted safelen consecutive iterations can be
253 executed concurrently, assume independence. */
254 if (loop->safelen >= 2)
256 if (loop->safelen < *max_vf)
257 *max_vf = loop->safelen;
258 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
259 return false;
262 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
263 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
265 if (dump_enabled_p ())
267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
268 "versioning for alias not supported for: "
269 "can't determine dependence between ");
270 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
271 DR_REF (dra));
272 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
273 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
274 DR_REF (drb));
275 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
277 return true;
280 if (dump_enabled_p ())
282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
283 "versioning for alias required: "
284 "can't determine dependence between ");
285 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
286 DR_REF (dra));
287 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
289 DR_REF (drb));
290 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
293 /* Add to list of ddrs that need to be tested at run-time. */
294 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
297 /* Known data dependence. */
298 if (DDR_NUM_DIST_VECTS (ddr) == 0)
300 /* If user asserted safelen consecutive iterations can be
301 executed concurrently, assume independence. */
302 if (loop->safelen >= 2)
304 if (loop->safelen < *max_vf)
305 *max_vf = loop->safelen;
306 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
307 return false;
310 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
311 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
313 if (dump_enabled_p ())
315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
316 "versioning for alias not supported for: "
317 "bad dist vector for ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
319 DR_REF (dra));
320 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
321 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
322 DR_REF (drb));
323 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
325 return true;
328 if (dump_enabled_p ())
330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
331 "versioning for alias required: "
332 "bad dist vector for ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
334 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
335 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
336 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
338 /* Add to list of ddrs that need to be tested at run-time. */
339 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
342 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
343 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
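      /* Distance of the dependence with respect to LOOP, in scalar
	 iterations.  */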
345 int dist = dist_v[loop_depth];
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance = %d.\n", dist);
351 if (dist == 0)
353 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "dependence distance == 0 between ");
357 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
358 dump_printf (MSG_NOTE, " and ");
359 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
360 dump_printf (MSG_NOTE, "\n");
363 /* When we perform grouped accesses and perform implicit CSE
364 by detecting equal accesses and doing disambiguation with
365 runtime alias tests like for
366 .. = a[i];
367 .. = a[i+1];
368 a[i] = ..;
369 a[i+1] = ..;
370 *p = ..;
371 .. = a[i];
372 .. = a[i+1];
373 where we will end up loading { a[i], a[i+1] } once, make
374 sure that inserting group loads before the first load and
375 stores after the last store will do the right thing.
376 Similar for groups like
377 a[i] = ...;
378 ... = a[i];
379 a[i+1] = ...;
380 where loads from the group interleave with the store. */
381 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
382 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
384 gimple *earlier_stmt;
385 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
386 if (DR_IS_WRITE
387 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
389 if (dump_enabled_p ())
390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
391 "READ_WRITE dependence in interleaving."
392 "\n");
393 return true;
397 continue;
400 if (dist > 0 && DDR_REVERSED_P (ddr))
402 /* If DDR_REVERSED_P the order of the data-refs in DDR was
403 reversed (to make distance vector positive), and the actual
404 distance is negative. */
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "dependence distance negative.\n");
408 /* Record a negative dependence distance to later limit the
409 amount of stmt copying / unrolling we can perform.
410 Only need to handle read-after-write dependence. */
411 if (DR_IS_READ (drb)
412 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
413 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
414 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
415 continue;
418 if (abs (dist) >= 2
419 && abs (dist) < *max_vf)
421 /* The dependence distance requires reduction of the maximal
422 vectorization factor. */
423 *max_vf = abs (dist);
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE, vect_location,
426 "adjusting maximal vectorization factor to %i\n",
427 *max_vf);
430 if (abs (dist) >= *max_vf)
432 /* In this case the dependence distance does not create a dependence
433 as far as vectorization is concerned. */
434 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE, vect_location,
436 "dependence distance >= VF.\n");
437 continue;
440 if (dump_enabled_p ())
442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
443 "not vectorized, possible dependence "
444 "between data-refs ");
445 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
446 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
447 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
448 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
451 return true;
454 return false;
457 /* Function vect_analyze_data_ref_dependences.
459 Examine all the data references in the loop, and make sure there do not
460 exist any data dependences between them. Set *MAX_VF according to
461 the maximum vectorization factor the data dependences allow. */
463 bool
464 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
466 unsigned int i;
467 struct data_dependence_relation *ddr;
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location,
471 "=== vect_analyze_data_ref_dependences ===\n");
473 LOOP_VINFO_DDRS (loop_vinfo)
474 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
475 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
476 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
477 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
478 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
479 &LOOP_VINFO_DDRS (loop_vinfo),
480 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
481 return false;
483 /* For epilogues we either have no aliases or alias versioning
484 was applied to the original loop. Therefore we may just get max_vf
485 from the VF of the original loop. */
486 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
487 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
488 else
489 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
490 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
491 return false;
493 return true;
497 /* Function vect_slp_analyze_data_ref_dependence.
499 Return TRUE if there (might) exist a dependence between a memory-reference
500 DRA and a memory-reference DRB that prevents SLP vectorization. Unlike the
501 loop variant there is no run-time alias versioning here and no *MAX_VF
502 adjustment. */
504 static bool
505 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
507 struct data_reference *dra = DDR_A (ddr);
508 struct data_reference *drb = DDR_B (ddr);
510 /* We need to check dependences of statements marked as unvectorizable
511 as well; they can still prohibit vectorization. */
513 /* Independent data accesses. */
514 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
515 return false;
517 if (dra == drb)
518 return false;
520 /* Read-read is OK. */
521 if (DR_IS_READ (dra) && DR_IS_READ (drb))
522 return false;
524 /* If dra and drb are part of the same interleaving chain consider
525 them independent. */
526 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
527 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
528 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
529 return false;
531 /* Unknown data dependence. */
532 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537 "can't determine dependence between ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
541 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 else if (dump_enabled_p ())
546 dump_printf_loc (MSG_NOTE, vect_location,
547 "determined dependence between ");
548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
549 dump_printf (MSG_NOTE, " and ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
551 dump_printf (MSG_NOTE, "\n");
554 return true;
558 /* Analyze dependences involved in the transform of SLP NODE. STORES
559 contains the vector of scalar stores of this instance if we are
560 disambiguating the loads; LAST_STORE is the last of those stores. */
562 static bool
563 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
564 vec<gimple *> stores, gimple *last_store)
566 /* This walks over all stmts involved in the SLP load/store done
567 in NODE verifying we can sink them up to the last stmt in the
568 group. */
569 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
570 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
572 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
573 if (access == last_access)
574 continue;
575 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
576 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
577 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
579 gimple *stmt = gsi_stmt (gsi);
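	  /* Statements that do not touch memory cannot create a dependence;
	     when the SLP access is a load, intervening loads are irrelevant
	     as well.  */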
580 if (! gimple_vuse (stmt)
581 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
582 continue;
584 /* If we couldn't record a (single) data reference for this
585 stmt we have to give up. */
586 /* ??? Here and below if dependence analysis fails we can resort
587 to the alias oracle which can handle more kinds of stmts. */
588 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
589 if (!dr_b)
590 return false;
592 bool dependent = false;
593 /* If we run into a store of this same instance (we've just
594 marked those) then delay dependence checking until we run
595 into the last store because this is where it will have
596 been sunk to (and we verify if we can do that as well). */
597 if (gimple_visited_p (stmt))
599 if (stmt != last_store)
600 continue;
601 unsigned i;
602 gimple *store;
603 FOR_EACH_VEC_ELT (stores, i, store)
605 data_reference *store_dr
606 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
607 ddr_p ddr = initialize_data_dependence_relation
608 (dr_a, store_dr, vNULL);
609 dependent = vect_slp_analyze_data_ref_dependence (ddr);
610 free_dependence_relation (ddr);
611 if (dependent)
612 break;
615 else
617 ddr_p ddr = initialize_data_dependence_relation (dr_a,
618 dr_b, vNULL);
619 dependent = vect_slp_analyze_data_ref_dependence (ddr);
620 free_dependence_relation (ddr);
622 if (dependent)
623 return false;
626 return true;
630 /* Function vect_slp_analyze_instance_dependence.
632 Examine all the data references in the SLP instance INSTANCE and make
633 sure there do not exist any data dependences that would prevent sinking
634 the loads and stores to their vectorized statement locations. */
636 bool
637 vect_slp_analyze_instance_dependence (slp_instance instance)
639 if (dump_enabled_p ())
640 dump_printf_loc (MSG_NOTE, vect_location,
641 "=== vect_slp_analyze_instance_dependence ===\n");
643 /* The stores of this instance are at the root of the SLP tree. */
644 slp_tree store = SLP_INSTANCE_TREE (instance);
645 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
646 store = NULL;
648 /* Verify we can sink stores to the vectorized stmt insert location. */
649 gimple *last_store = NULL;
650 if (store)
652 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
653 return false;
655 /* Mark stores in this instance and remember the last one. */
656 last_store = vect_find_last_scalar_stmt_in_slp (store);
657 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
658 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
661 bool res = true;
663 /* Verify we can sink loads to the vectorized stmt insert location,
664 special-casing stores of this instance. */
665 slp_tree load;
666 unsigned int i;
667 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
668 if (! vect_slp_analyze_node_dependences (instance, load,
669 store
670 ? SLP_TREE_SCALAR_STMTS (store)
671 : vNULL, last_store))
673 res = false;
674 break;
677 /* Unset the visited flag. */
678 if (store)
679 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
680 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
682 return res;
685 /* Function vect_compute_data_ref_alignment
687 Compute the misalignment of the data reference DR.
689 Output:
690 1. Return false if, during the misalignment computation, we find that
691 the data reference cannot be vectorized.
692 2. DR_MISALIGNMENT (DR) is defined.
694 FOR NOW: No analysis is actually performed. Misalignment is calculated
695 only for trivial cases. TODO. */
697 bool
698 vect_compute_data_ref_alignment (struct data_reference *dr)
700 gimple *stmt = DR_STMT (dr);
701 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
702 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
703 struct loop *loop = NULL;
704 tree ref = DR_REF (dr);
705 tree vectype;
706 tree base, base_addr;
707 tree misalign = NULL_TREE;
708 tree aligned_to;
709 tree step;
710 unsigned HOST_WIDE_INT alignment;
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE, vect_location,
714 "vect_compute_data_ref_alignment:\n");
716 if (loop_vinfo)
717 loop = LOOP_VINFO_LOOP (loop_vinfo);
719 /* Initialize misalignment to unknown. */
720 SET_DR_MISALIGNMENT (dr, -1);
722 if (tree_fits_shwi_p (DR_STEP (dr)))
723 misalign = DR_INIT (dr);
724 aligned_to = DR_ALIGNED_TO (dr);
725 base_addr = DR_BASE_ADDRESS (dr);
726 vectype = STMT_VINFO_VECTYPE (stmt_info);
728 /* In case the dataref is in an inner-loop of the loop that is being
729 vectorized (LOOP), we use the base and misalignment information
730 relative to the outer-loop (LOOP). This is ok only if the misalignment
731 stays the same throughout the execution of the inner-loop, which is why
732 we have to check that the stride of the dataref in the inner-loop is a
733 multiple of the vector size. */
734 if (loop && nested_in_vect_loop_p (loop, stmt))
736 tree step = DR_STEP (dr);
738 if (tree_fits_shwi_p (step)
739 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_NOTE, vect_location,
743 "inner step divides the vector-size.\n");
744 misalign = STMT_VINFO_DR_INIT (stmt_info);
745 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
746 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
748 else
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
752 "inner step doesn't divide the vector-size.\n");
753 misalign = NULL_TREE;
757 /* Similarly we can only use base and misalignment information relative to
758 an innermost loop if the misalignment stays the same throughout the
759 execution of the loop. As above, this is the case if the stride of the
760 dataref times the vectorization factor is a multiple of the vector size. */
761 else
763 tree step = DR_STEP (dr);
764 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
766 if (tree_fits_shwi_p (step)
767 && ((tree_to_shwi (step) * vf)
768 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
770 if (dump_enabled_p ())
771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
772 "step doesn't divide the vector-size.\n");
773 misalign = NULL_TREE;
777 /* To look at alignment of the base we have to preserve an inner MEM_REF
778 as that carries alignment information of the actual access. */
779 base = ref;
780 while (handled_component_p (base))
781 base = TREE_OPERAND (base, 0);
782 unsigned int base_alignment;
783 unsigned HOST_WIDE_INT base_bitpos;
784 get_object_alignment_1 (base, &base_alignment, &base_bitpos);
785 /* As data-ref analysis strips the MEM_REF down to its base operand
786 to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
787 adjust things to make base_alignment valid as the alignment of
788 DR_BASE_ADDRESS. */
789 if (TREE_CODE (base) == MEM_REF)
791 base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
792 base_bitpos &= (base_alignment - 1);
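  /* A residual bit position limits the usable alignment to its lowest
     set bit.  */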
794 if (base_bitpos != 0)
795 base_alignment = base_bitpos & -base_bitpos;
796 /* Also look at the alignment of the base address DR analysis
797 computed. */
798 unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
799 if (base_addr_alignment > base_alignment)
800 base_alignment = base_addr_alignment;
802 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
803 DR_VECT_AUX (dr)->base_element_aligned = true;
805 alignment = TYPE_ALIGN_UNIT (vectype);
807 if ((compare_tree_int (aligned_to, alignment) < 0)
808 || !misalign)
810 if (dump_enabled_p ())
812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
813 "Unknown alignment for access: ");
814 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
815 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
817 return true;
820 if (base_alignment < TYPE_ALIGN (vectype))
822 base = base_addr;
823 if (TREE_CODE (base) == ADDR_EXPR)
824 base = TREE_OPERAND (base, 0);
825 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
827 if (dump_enabled_p ())
829 dump_printf_loc (MSG_NOTE, vect_location,
830 "can't force alignment of ref: ");
831 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
832 dump_printf (MSG_NOTE, "\n");
834 return true;
837 if (DECL_USER_ALIGN (base))
839 if (dump_enabled_p ())
841 dump_printf_loc (MSG_NOTE, vect_location,
842 "not forcing alignment of user-aligned "
843 "variable: ");
844 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
845 dump_printf (MSG_NOTE, "\n");
847 return true;
850 /* Force the alignment of the decl.
851 NOTE: This is the only change to the code we make during
852 the analysis phase, before deciding to vectorize the loop. */
853 if (dump_enabled_p ())
855 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
856 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
857 dump_printf (MSG_NOTE, "\n");
860 DR_VECT_AUX (dr)->base_decl = base;
861 DR_VECT_AUX (dr)->base_misaligned = true;
862 DR_VECT_AUX (dr)->base_element_aligned = true;
865 if (loop && nested_in_vect_loop_p (loop, stmt))
866 step = STMT_VINFO_DR_STEP (stmt_info);
867 else
868 step = DR_STEP (dr);
869 /* If this is a backward running DR then the first access in the larger
870 vectype actually is N-1 elements before the address in the DR.
871 Adjust misalign accordingly. */
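  /* For example, a backward V4SI access with a 4-byte scalar step covers
     bytes [addr - 12, addr + 3], so the misalignment of interest is that
     of addr - 12.  */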
872 if (tree_int_cst_sgn (step) < 0)
874 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
875 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
876 otherwise we wouldn't be here. */
877 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
878 /* PLUS because STEP was negative. */
879 misalign = size_binop (PLUS_EXPR, misalign, offset);
882 SET_DR_MISALIGNMENT (dr,
883 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
885 if (dump_enabled_p ())
887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
888 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
889 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
890 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
893 return true;
897 /* Function vect_update_misalignment_for_peel
899 DR - the data reference whose misalignment is to be adjusted.
900 DR_PEEL - the data reference whose misalignment is being made
901 zero in the vector loop by the peel.
902 NPEEL - the number of iterations in the peel loop if the misalignment
903 of DR_PEEL is known at compile time. */
905 static void
906 vect_update_misalignment_for_peel (struct data_reference *dr,
907 struct data_reference *dr_peel, int npeel)
909 unsigned int i;
910 vec<dr_p> same_align_drs;
911 struct data_reference *current_dr;
912 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
913 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
914 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
915 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
917 /* For interleaved data accesses the step in the loop must be multiplied by
918 the size of the interleaving group. */
919 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
920 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
921 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
922 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
924 /* It can be assumed that the data refs with the same alignment as dr_peel
925 are aligned in the vector loop. */
926 same_align_drs
927 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
928 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
930 if (current_dr != dr)
931 continue;
932 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
933 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
934 SET_DR_MISALIGNMENT (dr, 0);
935 return;
938 if (known_alignment_for_access_p (dr)
939 && known_alignment_for_access_p (dr_peel))
941 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
942 int misal = DR_MISALIGNMENT (dr);
943 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
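      /* Peeling NPEEL scalar iterations moves DR by NPEEL * DR_SIZE bytes
	 (in the opposite direction for a negative step); the new
	 misalignment is that value modulo the vector alignment.  */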
944 misal += negative ? -npeel * dr_size : npeel * dr_size;
945 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
946 SET_DR_MISALIGNMENT (dr, misal);
947 return;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
952 SET_DR_MISALIGNMENT (dr, -1);
956 /* Function verify_data_ref_alignment
958 Return TRUE if DR can be handled with respect to alignment. */
960 static bool
961 verify_data_ref_alignment (data_reference_p dr)
963 enum dr_alignment_support supportable_dr_alignment
964 = vect_supportable_dr_alignment (dr, false);
965 if (!supportable_dr_alignment)
967 if (dump_enabled_p ())
969 if (DR_IS_READ (dr))
970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
971 "not vectorized: unsupported unaligned load.");
972 else
973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
974 "not vectorized: unsupported unaligned "
975 "store.");
977 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
978 DR_REF (dr));
979 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
981 return false;
984 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
985 dump_printf_loc (MSG_NOTE, vect_location,
986 "Vectorizing an unaligned access.\n");
988 return true;
991 /* Function vect_verify_datarefs_alignment
993 Return TRUE if all data references in the loop can be
994 handled with respect to alignment. */
996 bool
997 vect_verify_datarefs_alignment (loop_vec_info vinfo)
999 vec<data_reference_p> datarefs = vinfo->datarefs;
1000 struct data_reference *dr;
1001 unsigned int i;
1003 FOR_EACH_VEC_ELT (datarefs, i, dr)
1005 gimple *stmt = DR_STMT (dr);
1006 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1008 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1009 continue;
1011 /* For interleaving, only the alignment of the first access matters. */
1012 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1013 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1014 continue;
1016 /* Strided accesses perform only component accesses, alignment is
1017 irrelevant for them. */
1018 if (STMT_VINFO_STRIDED_P (stmt_info)
1019 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1020 continue;
1022 if (! verify_data_ref_alignment (dr))
1023 return false;
1026 return true;
1029 /* Given a memory reference EXP return whether its alignment is less
1030 than its size. */
1032 static bool
1033 not_size_aligned (tree exp)
1035 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1036 return true;
1038 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1039 > get_object_alignment (exp));
1042 /* Function vector_alignment_reachable_p
1044 Return true if vector alignment for DR is reachable by peeling
1045 a few loop iterations. Return false otherwise. */
1047 static bool
1048 vector_alignment_reachable_p (struct data_reference *dr)
1050 gimple *stmt = DR_STMT (dr);
1051 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1052 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1054 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1056 /* For interleaved accesses we peel only if the number of iterations in
1057 the prolog loop ({VF - misalignment}) is a multiple of the
1058 number of interleaved accesses. */
1059 int elem_size, mis_in_elements;
1060 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1062 /* FORNOW: handle only known alignment. */
1063 if (!known_alignment_for_access_p (dr))
1064 return false;
1066 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1067 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
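      /* The prolog peels nelements - mis_in_elements scalar iterations; for
	 an interleaved access this must cover a whole number of groups.  */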
1069 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1070 return false;
1073 /* If misalignment is known at compile time then allow peeling
1074 only if natural alignment is reachable through peeling. */
1075 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1077 HOST_WIDE_INT elmsize =
1078 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1079 if (dump_enabled_p ())
1081 dump_printf_loc (MSG_NOTE, vect_location,
1082 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1083 dump_printf (MSG_NOTE,
1084 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1086 if (DR_MISALIGNMENT (dr) % elmsize)
1088 if (dump_enabled_p ())
1089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1090 "data size does not divide the misalignment.\n");
1091 return false;
1095 if (!known_alignment_for_access_p (dr))
1097 tree type = TREE_TYPE (DR_REF (dr));
1098 bool is_packed = not_size_aligned (DR_REF (dr));
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101 "Unknown misalignment, %snaturally aligned\n",
1102 is_packed ? "not " : "");
1103 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1106 return true;
1110 /* Calculate the cost of the memory access represented by DR. */
1112 static void
1113 vect_get_data_access_cost (struct data_reference *dr,
1114 unsigned int *inside_cost,
1115 unsigned int *outside_cost,
1116 stmt_vector_for_cost *body_cost_vec)
1118 gimple *stmt = DR_STMT (dr);
1119 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1120 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1121 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1122 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1123 int ncopies = vf / nunits;
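  /* NCOPIES vector accesses are needed per vectorized iteration to cover
     the full vectorization factor.  */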
1125 if (DR_IS_READ (dr))
1126 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1127 NULL, body_cost_vec, false);
1128 else
1129 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1131 if (dump_enabled_p ())
1132 dump_printf_loc (MSG_NOTE, vect_location,
1133 "vect_get_data_access_cost: inside_cost = %d, "
1134 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1138 typedef struct _vect_peel_info
1140 int npeel;
1141 struct data_reference *dr;
1142 unsigned int count;
1143 } *vect_peel_info;
1145 typedef struct _vect_peel_extended_info
1147 struct _vect_peel_info peel_info;
1148 unsigned int inside_cost;
1149 unsigned int outside_cost;
1150 stmt_vector_for_cost body_cost_vec;
1151 } *vect_peel_extended_info;
1154 /* Peeling hashtable helpers. */
1156 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1158 static inline hashval_t hash (const _vect_peel_info *);
1159 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1162 inline hashval_t
1163 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1165 return (hashval_t) peel_info->npeel;
1168 inline bool
1169 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1171 return (a->npeel == b->npeel);
1175 /* Insert DR into peeling hash table with NPEEL as key. */
1177 static void
1178 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1179 loop_vec_info loop_vinfo, struct data_reference *dr,
1180 int npeel)
1182 struct _vect_peel_info elem, *slot;
1183 _vect_peel_info **new_slot;
1184 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1186 elem.npeel = npeel;
1187 slot = peeling_htab->find (&elem);
1188 if (slot)
1189 slot->count++;
1190 else
1192 slot = XNEW (struct _vect_peel_info);
1193 slot->npeel = npeel;
1194 slot->dr = dr;
1195 slot->count = 1;
1196 new_slot = peeling_htab->find_slot (slot, INSERT);
1197 *new_slot = slot;
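  /* Under the unlimited cost model, bias the count so that a peeling
     amount which aligns an access that is unsupported when unaligned is
     strongly preferred.  */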
1200 if (!supportable_dr_alignment
1201 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1202 slot->count += VECT_MAX_COST;
1206 /* Traverse the peeling hash table to find the peeling option that aligns
1207 the maximum number of data accesses. */
1209 int
1210 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1211 _vect_peel_extended_info *max)
1213 vect_peel_info elem = *slot;
1215 if (elem->count > max->peel_info.count
1216 || (elem->count == max->peel_info.count
1217 && max->peel_info.npeel > elem->npeel))
1219 max->peel_info.npeel = elem->npeel;
1220 max->peel_info.count = elem->count;
1221 max->peel_info.dr = elem->dr;
1224 return 1;
1228 /* Traverse peeling hash table and calculate cost for each peeling option.
1229 Find the one with the lowest cost. */
1231 int
1232 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1233 _vect_peel_extended_info *min)
1235 vect_peel_info elem = *slot;
1236 int save_misalignment, dummy;
1237 unsigned int inside_cost = 0, outside_cost = 0, i;
1238 gimple *stmt = DR_STMT (elem->dr);
1239 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1240 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1241 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1242 struct data_reference *dr;
1243 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1245 prologue_cost_vec.create (2);
1246 body_cost_vec.create (2);
1247 epilogue_cost_vec.create (2);
1249 FOR_EACH_VEC_ELT (datarefs, i, dr)
1251 stmt = DR_STMT (dr);
1252 stmt_info = vinfo_for_stmt (stmt);
1253 /* For interleaving, only the alignment of the first access
1254 matters. */
1255 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1256 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1257 continue;
1259 /* Strided accesses perform only component accesses, alignment is
1260 irrelevant for them. */
1261 if (STMT_VINFO_STRIDED_P (stmt_info)
1262 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1263 continue;
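      /* Temporarily apply the misalignment this peeling option would give
	 DR, compute the access cost, then restore the original value.  */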
1265 save_misalignment = DR_MISALIGNMENT (dr);
1266 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1267 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1268 &body_cost_vec);
1269 SET_DR_MISALIGNMENT (dr, save_misalignment);
1272 outside_cost += vect_get_known_peeling_cost
1273 (loop_vinfo, elem->npeel, &dummy,
1274 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1275 &prologue_cost_vec, &epilogue_cost_vec);
1277 /* Prologue and epilogue costs are added to the target model later.
1278 These costs depend only on the scalar iteration cost, the
1279 number of peeling iterations finally chosen, and the number of
1280 misaligned statements. So discard the information found here. */
1281 prologue_cost_vec.release ();
1282 epilogue_cost_vec.release ();
1284 if (inside_cost < min->inside_cost
1285 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1287 min->inside_cost = inside_cost;
1288 min->outside_cost = outside_cost;
1289 min->body_cost_vec.release ();
1290 min->body_cost_vec = body_cost_vec;
1291 min->peel_info.dr = elem->dr;
1292 min->peel_info.npeel = elem->npeel;
1294 else
1295 body_cost_vec.release ();
1297 return 1;
1301 /* Choose the best peeling option by traversing the peeling hash table:
1302 either the option with the lowest cost (if the cost model is enabled) or
1303 the option that aligns as many accesses as possible. */
1305 static struct data_reference *
1306 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1307 loop_vec_info loop_vinfo,
1308 unsigned int *npeel,
1309 stmt_vector_for_cost *body_cost_vec)
1311 struct _vect_peel_extended_info res;
1313 res.peel_info.dr = NULL;
1314 res.body_cost_vec = stmt_vector_for_cost ();
1316 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1318 res.inside_cost = INT_MAX;
1319 res.outside_cost = INT_MAX;
1320 peeling_htab->traverse <_vect_peel_extended_info *,
1321 vect_peeling_hash_get_lowest_cost> (&res);
1323 else
1325 res.peel_info.count = 0;
1326 peeling_htab->traverse <_vect_peel_extended_info *,
1327 vect_peeling_hash_get_most_frequent> (&res);
1330 *npeel = res.peel_info.npeel;
1331 *body_cost_vec = res.body_cost_vec;
1332 return res.peel_info.dr;
1336 /* Function vect_enhance_data_refs_alignment
1338 This pass will use loop versioning and loop peeling in order to enhance
1339 the alignment of data references in the loop.
1341 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1342 original loop is to be vectorized. Any other loops that are created by
1343 the transformations performed in this pass are not supposed to be
1344 vectorized. This restriction will be relaxed.
1346 This pass will require a cost model to guide it whether to apply peeling
1347 or versioning or a combination of the two. For example, the scheme that
1348 Intel uses when given a loop with several memory accesses, is as follows:
1349 choose one memory access ('p') whose alignment you want to force by doing
1350 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1351 other accesses are not necessarily aligned, or (2) use loop versioning to
1352 generate one loop in which all accesses are aligned, and another loop in
1353 which only 'p' is necessarily aligned.
1355 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1356 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1357 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1359 Devising a cost model is the most critical aspect of this work. It will
1360 guide us on which access to peel for, whether to use loop versioning, how
1361 many versions to create, etc. The cost model will probably consist of
1362 generic considerations as well as target specific considerations (on
1363 powerpc for example, misaligned stores are more painful than misaligned
1364 loads).
1366 Here are the general steps involved in alignment enhancements:
1368 -- original loop, before alignment analysis:
1369 for (i=0; i<N; i++){
1370 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1371 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1374 -- After vect_compute_data_refs_alignment:
1375 for (i=0; i<N; i++){
1376 x = q[i]; # DR_MISALIGNMENT(q) = 3
1377 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1380 -- Possibility 1: we do loop versioning:
1381 if (p is aligned) {
1382 for (i=0; i<N; i++){ # loop 1A
1383 x = q[i]; # DR_MISALIGNMENT(q) = 3
1384 p[i] = y; # DR_MISALIGNMENT(p) = 0
1387 else {
1388 for (i=0; i<N; i++){ # loop 1B
1389 x = q[i]; # DR_MISALIGNMENT(q) = 3
1390 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1394 -- Possibility 2: we do loop peeling:
1395 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1396 x = q[i];
1397 p[i] = y;
1399 for (i = 3; i < N; i++){ # loop 2A
1400 x = q[i]; # DR_MISALIGNMENT(q) = 0
1401 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1404 -- Possibility 3: combination of loop peeling and versioning:
1405 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1406 x = q[i];
1407 p[i] = y;
1409 if (p is aligned) {
1410 for (i = 3; i<N; i++){ # loop 3A
1411 x = q[i]; # DR_MISALIGNMENT(q) = 0
1412 p[i] = y; # DR_MISALIGNMENT(p) = 0
1415 else {
1416 for (i = 3; i<N; i++){ # loop 3B
1417 x = q[i]; # DR_MISALIGNMENT(q) = 0
1418 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1422 These loops are later passed to loop_transform to be vectorized. The
1423 vectorizer will use the alignment information to guide the transformation
1424 (whether to generate regular loads/stores, or with special handling for
1425 misalignment). */
1427 bool
1428 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1430 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1431 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1432 enum dr_alignment_support supportable_dr_alignment;
1433 struct data_reference *dr0 = NULL, *first_store = NULL;
1434 struct data_reference *dr;
1435 unsigned int i, j;
1436 bool do_peeling = false;
1437 bool do_versioning = false;
1438 bool stat;
1439 gimple *stmt;
1440 stmt_vec_info stmt_info;
1441 unsigned int npeel = 0;
1442 bool all_misalignments_unknown = true;
1443 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1444 unsigned possible_npeel_number = 1;
1445 tree vectype;
1446 unsigned int nelements, mis, same_align_drs_max = 0;
1447 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1448 hash_table<peel_info_hasher> peeling_htab (1);
1450 if (dump_enabled_p ())
1451 dump_printf_loc (MSG_NOTE, vect_location,
1452 "=== vect_enhance_data_refs_alignment ===\n");
1454 /* Reset data so we can safely be called multiple times. */
1455 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1456 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1458 /* While cost model enhancements are expected in the future, the high level
1459 view of the code at this time is as follows:
1461 A) If there is a misaligned access then see if peeling to align
1462 this access can make all data references satisfy
1463 vect_supportable_dr_alignment. If so, update data structures
1464 as needed and return true.
1466 B) If peeling wasn't possible and there is a data reference with an
1467 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1468 then see if loop versioning checks can be used to make all data
1469 references satisfy vect_supportable_dr_alignment. If so, update
1470 data structures as needed and return true.
1472 C) If neither peeling nor versioning were successful then return false if
1473 any data reference does not satisfy vect_supportable_dr_alignment.
1475 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1477 Note, Possibility 3 above (which is peeling and versioning together) is not
1478 being done at this time. */
1480 /* (1) Peeling to force alignment. */
1482 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1483 Considerations:
1484 + How many accesses will become aligned due to the peeling
1485 - How many accesses will become unaligned due to the peeling,
1486 and the cost of misaligned accesses.
1487 - The cost of peeling (the extra runtime checks, the increase
1488 in code size). */
1490 FOR_EACH_VEC_ELT (datarefs, i, dr)
1492 stmt = DR_STMT (dr);
1493 stmt_info = vinfo_for_stmt (stmt);
1495 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1496 continue;
1498 /* For interleaving, only the alignment of the first access
1499 matters. */
1500 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1501 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1502 continue;
1504 /* For invariant accesses there is nothing to enhance. */
1505 if (integer_zerop (DR_STEP (dr)))
1506 continue;
1508 /* Strided accesses perform only component accesses, alignment is
1509 irrelevant for them. */
1510 if (STMT_VINFO_STRIDED_P (stmt_info)
1511 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1512 continue;
1514 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1515 do_peeling = vector_alignment_reachable_p (dr);
1516 if (do_peeling)
1518 if (known_alignment_for_access_p (dr))
1520 unsigned int npeel_tmp;
1521 bool negative = tree_int_cst_compare (DR_STEP (dr),
1522 size_zero_node) < 0;
1524 /* Save info about DR in the hash table. */
1525 vectype = STMT_VINFO_VECTYPE (stmt_info);
1526 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1527 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1528 TREE_TYPE (DR_REF (dr))));
1529 npeel_tmp = (negative
1530 ? (mis - nelements) : (nelements - mis))
1531 & (nelements - 1);
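	  /* For example, with V4SI (nelements == 4) and a misalignment of
	     one element, peeling 3 scalar iterations aligns the access for
	     a positive step (1 iteration for a negative step).  */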
1533 /* For multiple types, it is possible that the bigger type access
1534 will have more than one peeling option. E.g., a loop with two
1535 types: one of size (vector size / 4), and the other one of
1536 size (vector size / 8). The vectorization factor will be 8. If both
1537 accesses are misaligned by 3, the first one needs one scalar
1538 iteration to be aligned, and the second one needs 5. But the
1539 first one will also be aligned by peeling 5 scalar
1540 iterations, and in that case both accesses will be aligned.
1541 Hence, besides the immediate peeling amount, we also want
1542 to try adding full vector-size steps (nelements scalar iterations),
1543 as long as we don't exceed the vectorization factor.
1544 We do this automatically for the cost model, since we calculate the
1545 cost for every peeling option. */
1546 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1548 if (STMT_SLP_TYPE (stmt_info))
1549 possible_npeel_number
1550 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1551 else
1552 possible_npeel_number = vf / nelements;
1555 /* Handle the aligned case. We may decide to align some other
1556 access, making DR unaligned. */
1557 if (DR_MISALIGNMENT (dr) == 0)
1559 npeel_tmp = 0;
1560 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1561 possible_npeel_number++;
1564 for (j = 0; j < possible_npeel_number; j++)
1566 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1567 dr, npeel_tmp);
1568 npeel_tmp += nelements;
1571 all_misalignments_unknown = false;
1572 /* The data-ref that was chosen for the case in which all the
1573 misalignments are unknown is no longer relevant, since we now
1574 have a data-ref with known alignment. */
1575 dr0 = NULL;
1577 else
1579 /* If we don't know any misalignment values, we prefer
1580 peeling for the data-ref that has the maximum number of data-refs
1581 with the same alignment, unless the target prefers to align
1582 stores over loads. */
1583 if (all_misalignments_unknown)
1585 unsigned same_align_drs
1586 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1587 if (!dr0
1588 || same_align_drs_max < same_align_drs)
1590 same_align_drs_max = same_align_drs;
1591 dr0 = dr;
1593 /* For data-refs with the same number of related
1594 accesses prefer the one where the misalign
1595 computation will be invariant in the outermost loop. */
1596 else if (same_align_drs_max == same_align_drs)
1598 struct loop *ivloop0, *ivloop;
1599 ivloop0 = outermost_invariant_loop_for_expr
1600 (loop, DR_BASE_ADDRESS (dr0));
1601 ivloop = outermost_invariant_loop_for_expr
1602 (loop, DR_BASE_ADDRESS (dr));
1603 if ((ivloop && !ivloop0)
1604 || (ivloop && ivloop0
1605 && flow_loop_nested_p (ivloop, ivloop0)))
1606 dr0 = dr;
1609 if (!first_store && DR_IS_WRITE (dr))
1610 first_store = dr;
1613 /* If there are both known and unknown misaligned accesses in the
1614 loop, we choose peeling amount according to the known
1615 accesses. */
1616 if (!supportable_dr_alignment)
1618 dr0 = dr;
1619 if (!first_store && DR_IS_WRITE (dr))
1620 first_store = dr;
1624 else
1626 if (!aligned_access_p (dr))
1628 if (dump_enabled_p ())
1629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1630 "vector alignment may not be reachable\n");
1631 break;
1636 /* Check if we can possibly peel the loop. */
1637 if (!vect_can_advance_ivs_p (loop_vinfo)
1638 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1639 || loop->inner)
1640 do_peeling = false;
1642 if (do_peeling
1643 && all_misalignments_unknown
1644 && vect_supportable_dr_alignment (dr0, false))
1646 /* Check whether the target prefers to align stores over loads, i.e.,
1647 whether misaligned stores are more expensive than misaligned loads
1648 (taking drs with the same alignment into account). */
1649 if (first_store && DR_IS_READ (dr0))
1651 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1652 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1653 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1654 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1655 stmt_vector_for_cost dummy;
1656 dummy.create (2);
1658 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1659 &dummy);
1660 vect_get_data_access_cost (first_store, &store_inside_cost,
1661 &store_outside_cost, &dummy);
1663 dummy.release ();
1665 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1666 aligning the load DR0). */
1667 load_inside_penalty = store_inside_cost;
1668 load_outside_penalty = store_outside_cost;
1669 for (i = 0;
1670 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1671 DR_STMT (first_store))).iterate (i, &dr);
1672 i++)
1673 if (DR_IS_READ (dr))
1675 load_inside_penalty += load_inside_cost;
1676 load_outside_penalty += load_outside_cost;
1678 else
1680 load_inside_penalty += store_inside_cost;
1681 load_outside_penalty += store_outside_cost;
1684 /* Calculate the penalty for leaving DR0 unaligned (by
1685 aligning the FIRST_STORE). */
1686 store_inside_penalty = load_inside_cost;
1687 store_outside_penalty = load_outside_cost;
1688 for (i = 0;
1689 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1690 DR_STMT (dr0))).iterate (i, &dr);
1691 i++)
1692 if (DR_IS_READ (dr))
1694 store_inside_penalty += load_inside_cost;
1695 store_outside_penalty += load_outside_cost;
1697 else
1699 store_inside_penalty += store_inside_cost;
1700 store_outside_penalty += store_outside_cost;
1703 if (load_inside_penalty > store_inside_penalty
1704 || (load_inside_penalty == store_inside_penalty
1705 && load_outside_penalty > store_outside_penalty))
1706 dr0 = first_store;
1709 /* In case there are only loads with different unknown misalignments, use
1710 peeling only if it may help to align other accesses in the loop or
1711 if it may help improve load bandwidth when we'd end up using
1712 unaligned loads. */
1713 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1714 if (!first_store
1715 && !STMT_VINFO_SAME_ALIGN_REFS (
1716 vinfo_for_stmt (DR_STMT (dr0))).length ()
1717 && (vect_supportable_dr_alignment (dr0, false)
1718 != dr_unaligned_supported
1719 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1720 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1721 do_peeling = false;
1724 if (do_peeling && !dr0)
1726 /* Peeling is possible, but there is no data access that would be
1727 unsupported if left unaligned. So we try to choose the best possible peeling. */
1729 /* We should get here only if there are drs with known misalignment. */
1730 gcc_assert (!all_misalignments_unknown);
1732 /* Choose the best peeling from the hash table. */
1733 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1734 loop_vinfo, &npeel,
1735 &body_cost_vec);
1736 if (!dr0 || !npeel)
1737 do_peeling = false;
1740 if (do_peeling)
1742 stmt = DR_STMT (dr0);
1743 stmt_info = vinfo_for_stmt (stmt);
1744 vectype = STMT_VINFO_VECTYPE (stmt_info);
1745 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1747 if (known_alignment_for_access_p (dr0))
1749 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1750 size_zero_node) < 0;
1751 if (!npeel)
1753 /* Since it's known at compile time, compute the number of
1754 iterations in the peeled loop (the peeling factor) for use in
1755 updating DR_MISALIGNMENT values. The peeling factor is the
1756 vectorization factor minus the misalignment as an element
1757 count. */
1758 mis = DR_MISALIGNMENT (dr0);
1759 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1760 npeel = ((negative ? mis - nelements : nelements - mis)
1761 & (nelements - 1));
1764 /* For interleaved data access every iteration accesses all the
1765 members of the group, therefore we divide the number of iterations
1766 by the group size. */
1767 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1768 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1769 npeel /= GROUP_SIZE (stmt_info);
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_NOTE, vect_location,
1773 "Try peeling by %d\n", npeel);
1776 /* Ensure that all data refs can be vectorized after the peel. */
1777 FOR_EACH_VEC_ELT (datarefs, i, dr)
1779 int save_misalignment;
1781 if (dr == dr0)
1782 continue;
1784 stmt = DR_STMT (dr);
1785 stmt_info = vinfo_for_stmt (stmt);
1786 /* For interleaving, only the alignment of the first access
1787 matters. */
1788 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1789 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1790 continue;
1792 /* Strided accesses perform only component accesses, alignment is
1793 irrelevant for them. */
1794 if (STMT_VINFO_STRIDED_P (stmt_info)
1795 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1796 continue;
1798 save_misalignment = DR_MISALIGNMENT (dr);
1799 vect_update_misalignment_for_peel (dr, dr0, npeel);
1800 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1801 SET_DR_MISALIGNMENT (dr, save_misalignment);
1803 if (!supportable_dr_alignment)
1805 do_peeling = false;
1806 break;
1810 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1812 stat = vect_verify_datarefs_alignment (loop_vinfo);
1813 if (!stat)
1814 do_peeling = false;
1815 else
1817 body_cost_vec.release ();
1818 return stat;
1822 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1823 if (do_peeling)
1825 unsigned max_allowed_peel
1826 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1827 if (max_allowed_peel != (unsigned)-1)
1829 unsigned max_peel = npeel;
1830 if (max_peel == 0)
1832 gimple *dr_stmt = DR_STMT (dr0);
1833 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1834 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1835 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1837 if (max_peel > max_allowed_peel)
1839 do_peeling = false;
1840 if (dump_enabled_p ())
1841 dump_printf_loc (MSG_NOTE, vect_location,
1842 "Disable peeling, max peels reached: %d\n", max_peel);
1847 /* Cost model #2 - if peeling may result in a remaining loop not
1848 iterating enough to be vectorized then do not peel. */
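/* For example, with a vectorization factor of 4 and NPEEL unknown (0),
   MAX_PEEL is 3, so a loop known to run fewer than 7 iterations is not
   peeled.  */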
1849 if (do_peeling
1850 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1852 unsigned max_peel
1853 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1854 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1855 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1856 do_peeling = false;
1859 if (do_peeling)
1861 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1862 If the misalignment of DR_i is identical to that of dr0 then set
1863 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1864 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1865 by the peeling factor times the element size of DR_i (MOD the
1866 vectorization factor times the size). Otherwise, the
1867 misalignment of DR_i must be set to unknown. */
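/* For instance, with npeel == 2 and 4-byte elements, a data reference with
   known misalignment 4 ends up with misalignment (4 + 2*4) mod 16 == 12
   for a 16-byte vector.  */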
1868 FOR_EACH_VEC_ELT (datarefs, i, dr)
1869 if (dr != dr0)
1871 /* Strided accesses perform only component accesses, alignment
1872 is irrelevant for them. */
1873 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1874 if (STMT_VINFO_STRIDED_P (stmt_info)
1875 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1876 continue;
1878 vect_update_misalignment_for_peel (dr, dr0, npeel);
1881 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1882 if (npeel)
1883 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1884 else
1885 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1886 = DR_MISALIGNMENT (dr0);
1887 SET_DR_MISALIGNMENT (dr0, 0);
1888 if (dump_enabled_p ())
1890 dump_printf_loc (MSG_NOTE, vect_location,
1891 "Alignment of access forced using peeling.\n");
1892 dump_printf_loc (MSG_NOTE, vect_location,
1893 "Peeling for alignment will be applied.\n");
1895 /* The inside-loop cost will be accounted for in vectorizable_load
1896 and vectorizable_store correctly with adjusted alignments.
1897 Drop the body_cost_vec on the floor here. */
1898 body_cost_vec.release ();
1900 stat = vect_verify_datarefs_alignment (loop_vinfo);
1901 gcc_assert (stat);
1902 return stat;
1906 body_cost_vec.release ();
1908 /* (2) Versioning to force alignment. */
1910 /* Try versioning if:
1911 1) optimize loop for speed
1912 2) there is at least one unsupported misaligned data ref with an unknown
1913 misalignment, and
1914 3) all misaligned data refs with a known misalignment are supported, and
1915 4) the number of runtime alignment checks is within reason. */
1917 do_versioning =
1918 optimize_loop_nest_for_speed_p (loop)
1919 && (!loop->inner); /* FORNOW */
1921 if (do_versioning)
1923 FOR_EACH_VEC_ELT (datarefs, i, dr)
1925 stmt = DR_STMT (dr);
1926 stmt_info = vinfo_for_stmt (stmt);
1928 /* For interleaving, only the alignment of the first access
1929 matters. */
1930 if (aligned_access_p (dr)
1931 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1932 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1933 continue;
1935 if (STMT_VINFO_STRIDED_P (stmt_info))
1937 /* Strided loads perform only component accesses, alignment is
1938 irrelevant for them. */
1939 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1940 continue;
1941 do_versioning = false;
1942 break;
1945 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1947 if (!supportable_dr_alignment)
1949 gimple *stmt;
1950 int mask;
1951 tree vectype;
1953 if (known_alignment_for_access_p (dr)
1954 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1955 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1957 do_versioning = false;
1958 break;
1961 stmt = DR_STMT (dr);
1962 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1963 gcc_assert (vectype);
1965 /* The rightmost bits of an aligned address must be zeros.
1966 Construct the mask needed for this test. For example,
1967 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1968 mask must be 15 = 0xf. */
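/* The vectorized version of the loop is then guarded by a runtime test,
   built from LOOP_VINFO_MAY_MISALIGN_STMTS, roughly of the form
   ((addr_1 | ... | addr_n) & mask) == 0.  */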
1969 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1971 /* FORNOW: use the same mask to test all potentially unaligned
1972 references in the loop. The vectorizer currently supports
1973 a single vector size, see the reference to
1974 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1975 vectorization factor is computed. */
1976 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1977 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1978 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1979 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1980 DR_STMT (dr));
1984 /* Versioning requires at least one misaligned data reference. */
1985 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1986 do_versioning = false;
1987 else if (!do_versioning)
1988 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1991 if (do_versioning)
1993 vec<gimple *> may_misalign_stmts
1994 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1995 gimple *stmt;
1997 /* It can now be assumed that the data references in the statements
1998 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1999 of the loop being vectorized. */
2000 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2002 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2003 dr = STMT_VINFO_DATA_REF (stmt_info);
2004 SET_DR_MISALIGNMENT (dr, 0);
2005 if (dump_enabled_p ())
2006 dump_printf_loc (MSG_NOTE, vect_location,
2007 "Alignment of access forced using versioning.\n");
2010 if (dump_enabled_p ())
2011 dump_printf_loc (MSG_NOTE, vect_location,
2012 "Versioning for alignment will be applied.\n");
2014 /* Peeling and versioning can't be done together at this time. */
2015 gcc_assert (! (do_peeling && do_versioning));
2017 stat = vect_verify_datarefs_alignment (loop_vinfo);
2018 gcc_assert (stat);
2019 return stat;
2022 /* This point is reached if neither peeling nor versioning is being done. */
2023 gcc_assert (! (do_peeling || do_versioning));
2025 stat = vect_verify_datarefs_alignment (loop_vinfo);
2026 return stat;
2030 /* Function vect_find_same_alignment_drs.
2032 Update group and alignment relations according to the chosen
2033 vectorization factor. */
2035 static void
2036 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2037 loop_vec_info loop_vinfo)
2039 unsigned int i;
2040 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2041 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2042 struct data_reference *dra = DDR_A (ddr);
2043 struct data_reference *drb = DDR_B (ddr);
2044 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2045 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2046 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2047 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2048 lambda_vector dist_v;
2049 unsigned int loop_depth;
2051 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2052 return;
2054 if (dra == drb)
2055 return;
2057 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2058 return;
2060 /* Loop-based vectorization and known data dependence. */
2061 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2062 return;
2064 /* Data-dependence analysis reports a distance vector of zero
2065 for data-references that overlap only in the first iteration
2066 but have steps of different sign (see PR45764).
2067 So as a sanity check require equal DR_STEP. */
2068 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2069 return;
2071 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2072 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2074 int dist = dist_v[loop_depth];
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_NOTE, vect_location,
2078 "dependence distance = %d.\n", dist);
2080 /* Same loop iteration. */
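/* For example, with a vectorization factor of 4 and 4-byte elements,
   accesses with dependence distance 0 or 4 (such as a[i] and a[i+4]) are
   known to share the same alignment, while a distance of 2 guarantees
   nothing.  */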
2081 if (dist == 0
2082 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2084 /* Two references with distance zero have the same alignment. */
2085 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2086 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2087 if (dump_enabled_p ())
2089 dump_printf_loc (MSG_NOTE, vect_location,
2090 "accesses have the same alignment.\n");
2091 dump_printf (MSG_NOTE,
2092 "dependence distance modulo vf == 0 between ");
2093 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2094 dump_printf (MSG_NOTE, " and ");
2095 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2096 dump_printf (MSG_NOTE, "\n");
2103 /* Function vect_analyze_data_refs_alignment
2105 Analyze the alignment of the data-references in the loop.
2106 Return FALSE if a data reference is found that cannot be vectorized. */
2108 bool
2109 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2111 if (dump_enabled_p ())
2112 dump_printf_loc (MSG_NOTE, vect_location,
2113 "=== vect_analyze_data_refs_alignment ===\n");
2115 /* Mark groups of data references with same alignment using
2116 data dependence information. */
2117 vec<ddr_p> ddrs = vinfo->ddrs;
2118 struct data_dependence_relation *ddr;
2119 unsigned int i;
2121 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2122 vect_find_same_alignment_drs (ddr, vinfo);
2124 vec<data_reference_p> datarefs = vinfo->datarefs;
2125 struct data_reference *dr;
2127 FOR_EACH_VEC_ELT (datarefs, i, dr)
2129 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2130 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2131 && !vect_compute_data_ref_alignment (dr))
2133 /* Strided accesses perform only component accesses, misalignment
2134 information is irrelevant for them. */
2135 if (STMT_VINFO_STRIDED_P (stmt_info)
2136 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2137 continue;
2139 if (dump_enabled_p ())
2140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2141 "not vectorized: can't calculate alignment "
2142 "for data ref.\n");
2144 return false;
2148 return true;
2152 /* Analyze alignment of DRs of stmts in NODE. */
2154 static bool
2155 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2157 /* We vectorize from the first scalar stmt in the node unless
2158 the node is permuted, in which case we start from the first
2159 element in the group. */
2160 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2161 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2162 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2163 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2165 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2166 if (! vect_compute_data_ref_alignment (dr)
2167 /* For creating the data-ref pointer we need alignment of the
2168 first element anyway. */
2169 || (dr != first_dr
2170 && ! vect_compute_data_ref_alignment (first_dr))
2171 || ! verify_data_ref_alignment (dr))
2173 if (dump_enabled_p ())
2174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2175 "not vectorized: bad data alignment in basic "
2176 "block.\n");
2177 return false;
2180 return true;
2183 /* Function vect_slp_analyze_and_verify_instance_alignment
2185 Analyze the alignment of the data-references in the SLP instance.
2186 Return FALSE if a data reference is found that cannot be vectorized. */
2188 bool
2189 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_NOTE, vect_location,
2193 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2195 slp_tree node;
2196 unsigned i;
2197 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2198 if (! vect_slp_analyze_and_verify_node_alignment (node))
2199 return false;
2201 node = SLP_INSTANCE_TREE (instance);
2202 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2203 && ! vect_slp_analyze_and_verify_node_alignment
2204 (SLP_INSTANCE_TREE (instance)))
2205 return false;
2207 return true;
2211 /* Analyze groups of accesses: check that DR belongs to a group of
2212 accesses of legal size, step, etc. Detect gaps, single element
2213 interleaving, and other special cases. Set grouped access info.
2214 Collect groups of strided stores for further use in SLP analysis.
2215 Worker for vect_analyze_group_access. */
2217 static bool
2218 vect_analyze_group_access_1 (struct data_reference *dr)
2220 tree step = DR_STEP (dr);
2221 tree scalar_type = TREE_TYPE (DR_REF (dr));
2222 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2223 gimple *stmt = DR_STMT (dr);
2224 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2225 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2226 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2227 HOST_WIDE_INT dr_step = -1;
2228 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2229 bool slp_impossible = false;
2231 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2232 size of the interleaving group (including gaps). */
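/* For example, a constant step of 32 bytes over 4-byte elements gives a
   GROUPSIZE of 8.  */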
2233 if (tree_fits_shwi_p (step))
2235 dr_step = tree_to_shwi (step);
2236 /* Check that STEP is a multiple of type size. Otherwise there is
2237 a non-element-sized gap at the end of the group which we
2238 cannot represent in GROUP_GAP or GROUP_SIZE.
2239 ??? As we can handle non-constant step fine here we should
2240 simply remove uses of GROUP_GAP between the last and first
2241 element and instead rely on DR_STEP. GROUP_SIZE then would
2242 simply not include that gap. */
2243 if ((dr_step % type_size) != 0)
2245 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_NOTE, vect_location,
2248 "Step ");
2249 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2250 dump_printf (MSG_NOTE,
2251 " is not a multiple of the element size for ");
2252 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2253 dump_printf (MSG_NOTE, "\n");
2255 return false;
2257 groupsize = absu_hwi (dr_step) / type_size;
2259 else
2260 groupsize = 0;
2262 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2263 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2265 /* Check if this DR is part of an interleaving group, and is a single
2266 element of the group that is accessed in the loop. */
2268 /* Gaps are supported only for loads. STEP must be a multiple of the type
2269 size. The size of the group must be a power of 2. */
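/* For example, a read like

     for (i = 0; i < n; i++)
       ... = a[4*i];

   with 4-byte elements has a 16-byte step and is treated as single element
   interleaving with GROUP_SIZE 4 and GROUP_GAP 3.  */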
2270 if (DR_IS_READ (dr)
2271 && (dr_step % type_size) == 0
2272 && groupsize > 0
2273 && pow2p_hwi (groupsize))
2275 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2276 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2277 GROUP_GAP (stmt_info) = groupsize - 1;
2278 if (dump_enabled_p ())
2280 dump_printf_loc (MSG_NOTE, vect_location,
2281 "Detected single element interleaving ");
2282 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2283 dump_printf (MSG_NOTE, " step ");
2284 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2285 dump_printf (MSG_NOTE, "\n");
2288 return true;
2291 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2294 "not consecutive access ");
2295 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2298 if (bb_vinfo)
2300 /* Mark the statement as unvectorizable. */
2301 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2302 return true;
2305 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2306 STMT_VINFO_STRIDED_P (stmt_info) = true;
2307 return true;
2310 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2312 /* First stmt in the interleaving chain. Check the chain. */
2313 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2314 struct data_reference *data_ref = dr;
2315 unsigned int count = 1;
2316 tree prev_init = DR_INIT (data_ref);
2317 gimple *prev = stmt;
2318 HOST_WIDE_INT diff, gaps = 0;
2320 while (next)
2322 /* Skip same data-refs. In case that two or more stmts share
2323 data-ref (supported only for loads), we vectorize only the first
2324 stmt, and the rest get their vectorized loads from the first
2325 one. */
2326 if (!tree_int_cst_compare (DR_INIT (data_ref),
2327 DR_INIT (STMT_VINFO_DATA_REF (
2328 vinfo_for_stmt (next)))))
2330 if (DR_IS_WRITE (data_ref))
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 "Two store stmts share the same dr.\n");
2335 return false;
2338 if (dump_enabled_p ())
2339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2340 "Two or more load stmts share the same dr.\n");
2342 /* For load use the same data-ref load. */
2343 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2345 prev = next;
2346 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2347 continue;
2350 prev = next;
2351 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2353 /* All group members have the same STEP by construction. */
2354 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2356 /* Check that the distance between two accesses is equal to the type
2357 size. Otherwise, we have gaps. */
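/* For example, for group members a[3*i] and a[3*i + 2] with 4-byte
   elements, DIFF is 2, so the second member gets GROUP_GAP 2 and SLP of
   the group becomes impossible.  */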
2358 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2359 - TREE_INT_CST_LOW (prev_init)) / type_size;
2360 if (diff != 1)
2362 /* FORNOW: SLP of accesses with gaps is not supported. */
2363 slp_impossible = true;
2364 if (DR_IS_WRITE (data_ref))
2366 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2368 "interleaved store with gaps\n");
2369 return false;
2372 gaps += diff - 1;
2375 last_accessed_element += diff;
2377 /* Store the gap from the previous member of the group. If there is no
2378 gap in the access, GROUP_GAP is always 1. */
2379 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2381 prev_init = DR_INIT (data_ref);
2382 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2383 /* Count the number of data-refs in the chain. */
2384 count++;
2387 if (groupsize == 0)
2388 groupsize = count + gaps;
2390 /* This could be UINT_MAX but as we are generating code in a very
2391 inefficient way we have to cap earlier. See PR78699 for example. */
2392 if (groupsize > 4096)
2394 if (dump_enabled_p ())
2395 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2396 "group is too large\n");
2397 return false;
2400 /* Check that the size of the interleaving is equal to count for stores,
2401 i.e., that there are no gaps. */
2402 if (groupsize != count
2403 && !DR_IS_READ (dr))
2405 if (dump_enabled_p ())
2406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2407 "interleaved store with gaps\n");
2408 return false;
2411 /* If there is a gap after the last load in the group it is the
2412 difference between the groupsize and the last accessed
2413 element.
2414 When there is no gap, this difference should be 0. */
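/* For example, a group reading only a[4*i] and a[4*i + 1] has GROUPSIZE 4
   and LAST_ACCESSED_ELEMENT 2, so the first stmt gets GROUP_GAP 2.  */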
2415 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2417 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2418 if (dump_enabled_p ())
2420 dump_printf_loc (MSG_NOTE, vect_location,
2421 "Detected interleaving ");
2422 if (DR_IS_READ (dr))
2423 dump_printf (MSG_NOTE, "load ");
2424 else
2425 dump_printf (MSG_NOTE, "store ");
2426 dump_printf (MSG_NOTE, "of size %u starting with ",
2427 (unsigned)groupsize);
2428 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2429 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2430 dump_printf_loc (MSG_NOTE, vect_location,
2431 "There is a gap of %u elements after the group\n",
2432 GROUP_GAP (vinfo_for_stmt (stmt)));
2435 /* SLP: create an SLP data structure for every interleaving group of
2436 stores for further analysis in vect_analyze_slp. */
2437 if (DR_IS_WRITE (dr) && !slp_impossible)
2439 if (loop_vinfo)
2440 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2441 if (bb_vinfo)
2442 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2446 return true;
2449 /* Analyze groups of accesses: check that DR belongs to a group of
2450 accesses of legal size, step, etc. Detect gaps, single element
2451 interleaving, and other special cases. Set grouped access info.
2452 Collect groups of strided stores for further use in SLP analysis. */
2454 static bool
2455 vect_analyze_group_access (struct data_reference *dr)
2457 if (!vect_analyze_group_access_1 (dr))
2459 /* Dissolve the group if present. */
2460 gimple *next;
2461 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2462 while (stmt)
2464 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2465 next = GROUP_NEXT_ELEMENT (vinfo);
2466 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2467 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2468 stmt = next;
2470 return false;
2472 return true;
2475 /* Analyze the access pattern of the data-reference DR.
2476 In case of non-consecutive accesses call vect_analyze_group_access() to
2477 analyze groups of accesses. */
2479 static bool
2480 vect_analyze_data_ref_access (struct data_reference *dr)
2482 tree step = DR_STEP (dr);
2483 tree scalar_type = TREE_TYPE (DR_REF (dr));
2484 gimple *stmt = DR_STMT (dr);
2485 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2486 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2487 struct loop *loop = NULL;
2489 if (loop_vinfo)
2490 loop = LOOP_VINFO_LOOP (loop_vinfo);
2492 if (loop_vinfo && !step)
2494 if (dump_enabled_p ())
2495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2496 "bad data-ref access in loop\n");
2497 return false;
2500 /* Allow loads with zero step in inner-loop vectorization. */
2501 if (loop_vinfo && integer_zerop (step))
2503 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2504 if (!nested_in_vect_loop_p (loop, stmt))
2505 return DR_IS_READ (dr);
2506 /* Allow references with zero step for outer loops marked
2507 with pragma omp simd only - it guarantees absence of
2508 loop-carried dependencies between inner loop iterations. */
2509 if (!loop->force_vectorize)
2511 if (dump_enabled_p ())
2512 dump_printf_loc (MSG_NOTE, vect_location,
2513 "zero step in inner loop of nest\n");
2514 return false;
2518 if (loop && nested_in_vect_loop_p (loop, stmt))
2520 /* Interleaved accesses are not yet supported within outer-loop
2521 vectorization for references in the inner-loop. */
2522 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2524 /* For the rest of the analysis we use the outer-loop step. */
2525 step = STMT_VINFO_DR_STEP (stmt_info);
2526 if (integer_zerop (step))
2528 if (dump_enabled_p ())
2529 dump_printf_loc (MSG_NOTE, vect_location,
2530 "zero step in outer loop.\n");
2531 return DR_IS_READ (dr);
2535 /* Consecutive? */
2536 if (TREE_CODE (step) == INTEGER_CST)
2538 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2539 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2540 || (dr_step < 0
2541 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2543 /* Mark that it is not interleaving. */
2544 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2545 return true;
2549 if (loop && nested_in_vect_loop_p (loop, stmt))
2551 if (dump_enabled_p ())
2552 dump_printf_loc (MSG_NOTE, vect_location,
2553 "grouped access in outer loop.\n");
2554 return false;
2558 /* Assume this is a DR handled by non-constant strided load case. */
2559 if (TREE_CODE (step) != INTEGER_CST)
2560 return (STMT_VINFO_STRIDED_P (stmt_info)
2561 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2562 || vect_analyze_group_access (dr)));
2564 /* Not a consecutive access - check if it's part of an interleaving group. */
2565 return vect_analyze_group_access (dr);
2570 /* A helper function used in the comparator function to sort data
2571 references. T1 and T2 are two data references to be compared.
2572 The function returns -1, 0, or 1. */
2574 static int
2575 compare_tree (tree t1, tree t2)
2577 int i, cmp;
2578 enum tree_code code;
2579 char tclass;
2581 if (t1 == t2)
2582 return 0;
2583 if (t1 == NULL)
2584 return -1;
2585 if (t2 == NULL)
2586 return 1;
2588 STRIP_NOPS (t1);
2589 STRIP_NOPS (t2);
2591 if (TREE_CODE (t1) != TREE_CODE (t2))
2592 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2594 code = TREE_CODE (t1);
2595 switch (code)
2597 /* For const values, we can just use hash values for comparisons. */
2598 case INTEGER_CST:
2599 case REAL_CST:
2600 case FIXED_CST:
2601 case STRING_CST:
2602 case COMPLEX_CST:
2603 case VECTOR_CST:
2605 hashval_t h1 = iterative_hash_expr (t1, 0);
2606 hashval_t h2 = iterative_hash_expr (t2, 0);
2607 if (h1 != h2)
2608 return h1 < h2 ? -1 : 1;
2609 break;
2612 case SSA_NAME:
2613 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2614 if (cmp != 0)
2615 return cmp;
2617 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2618 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2619 break;
2621 default:
2622 tclass = TREE_CODE_CLASS (code);
2624 /* For var-decl, we could compare their UIDs. */
2625 if (tclass == tcc_declaration)
2627 if (DECL_UID (t1) != DECL_UID (t2))
2628 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2629 break;
2632 /* For expressions with operands, compare their operands recursively. */
2633 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2635 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2636 if (cmp != 0)
2637 return cmp;
2641 return 0;
2645 /* Compare two data-references DRA and DRB to sort them into an order
2646 suitable for grouping. */
2648 static int
2649 dr_group_sort_cmp (const void *dra_, const void *drb_)
2651 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2652 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2653 int cmp;
2655 /* Stabilize sort. */
2656 if (dra == drb)
2657 return 0;
2659 /* DRs in different loops never belong to the same group. */
2660 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2661 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2662 if (loopa != loopb)
2663 return loopa->num < loopb->num ? -1 : 1;
2665 /* Ordering of DRs according to base. */
2666 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2668 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2669 if (cmp != 0)
2670 return cmp;
2673 /* And according to DR_OFFSET. */
2674 if (!dr_equal_offsets_p (dra, drb))
2676 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2677 if (cmp != 0)
2678 return cmp;
2681 /* Put reads before writes. */
2682 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2683 return DR_IS_READ (dra) ? -1 : 1;
2685 /* Then sort by access size. */
2686 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2687 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2689 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2690 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2691 if (cmp != 0)
2692 return cmp;
2695 /* And by step. */
2696 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2698 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2699 if (cmp != 0)
2700 return cmp;
2703 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2704 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2705 if (cmp == 0)
2706 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2707 return cmp;
2710 /* Function vect_analyze_data_ref_accesses.
2712 Analyze the access pattern of all the data references in the loop.
2714 FORNOW: the only access pattern that is considered vectorizable is a
2715 simple step 1 (consecutive) access.
2717 FORNOW: handle only arrays and pointer accesses. */
2719 bool
2720 vect_analyze_data_ref_accesses (vec_info *vinfo)
2722 unsigned int i;
2723 vec<data_reference_p> datarefs = vinfo->datarefs;
2724 struct data_reference *dr;
2726 if (dump_enabled_p ())
2727 dump_printf_loc (MSG_NOTE, vect_location,
2728 "=== vect_analyze_data_ref_accesses ===\n");
2730 if (datarefs.is_empty ())
2731 return true;
2733 /* Sort the array of datarefs to make building the interleaving chains
2734 linear. Don't modify the original vector's order, it is needed for
2735 determining what dependencies are reversed. */
2736 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2737 datarefs_copy.qsort (dr_group_sort_cmp);
2739 /* Build the interleaving chains. */
2740 for (i = 0; i < datarefs_copy.length () - 1;)
2742 data_reference_p dra = datarefs_copy[i];
2743 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2744 stmt_vec_info lastinfo = NULL;
2745 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2747 ++i;
2748 continue;
2750 for (i = i + 1; i < datarefs_copy.length (); ++i)
2752 data_reference_p drb = datarefs_copy[i];
2753 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2754 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2755 break;
2757 /* ??? Imperfect sorting (non-compatible types, non-modulo
2758 accesses, same accesses) can lead to a group being artificially
2759 split here as we don't just skip over those. If it really
2760 matters we can push those to a worklist and re-iterate
2761 over them. Then we can just skip ahead to the next DR here. */
2763 /* DRs in a different loop should not be put into the same
2764 interleaving group. */
2765 if (gimple_bb (DR_STMT (dra))->loop_father
2766 != gimple_bb (DR_STMT (drb))->loop_father)
2767 break;
2769 /* Check that the data-refs have the same first location (except init)
2770 and they are both either store or load (not load and store,
2771 not masked loads or stores). */
2772 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2773 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2774 DR_BASE_ADDRESS (drb), 0)
2775 || !dr_equal_offsets_p (dra, drb)
2776 || !gimple_assign_single_p (DR_STMT (dra))
2777 || !gimple_assign_single_p (DR_STMT (drb)))
2778 break;
2780 /* Check that the data-refs have the same constant size. */
2781 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2782 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2783 if (!tree_fits_uhwi_p (sza)
2784 || !tree_fits_uhwi_p (szb)
2785 || !tree_int_cst_equal (sza, szb))
2786 break;
2788 /* Check that the data-refs have the same step. */
2789 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2790 break;
2792 /* Do not place the same access in the interleaving chain twice. */
2793 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2794 break;
2796 /* Check the types are compatible.
2797 ??? We don't distinguish this during sorting. */
2798 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2799 TREE_TYPE (DR_REF (drb))))
2800 break;
2802 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2803 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2804 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2805 gcc_assert (init_a <= init_b);
2807 /* If init_b == init_a + the size of the type * k, we have an
2808 interleaving, and DRA is accessed before DRB. */
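/* For example, 4-byte loads from a[3*i] and a[3*i + 2] have inits 0 and 8;
   8 is a multiple of the element size and smaller than the 12-byte step, so
   they are linked into one interleaving chain (with a gap at a[3*i + 1]).  */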
2809 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2810 if (type_size_a == 0
2811 || (init_b - init_a) % type_size_a != 0)
2812 break;
2814 /* If we have a store, the accesses are adjacent. This splits
2815 groups into chunks we support (we don't support vectorization
2816 of stores with gaps). */
2817 if (!DR_IS_READ (dra)
2818 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2819 (DR_INIT (datarefs_copy[i-1]))
2820 != type_size_a))
2821 break;
2823 /* If the step (when constant and nonzero) is greater than the
2824 difference between the data-refs' inits, this splits groups into
2825 suitable sizes. */
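/* For example, loads from a[2*i] and a[2*i + 4] have a step of 8 bytes but
   inits 0 and 16, so they are kept in separate groups.  */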
2826 if (tree_fits_shwi_p (DR_STEP (dra)))
2828 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2829 if (step != 0 && step <= (init_b - init_a))
2830 break;
2833 if (dump_enabled_p ())
2835 dump_printf_loc (MSG_NOTE, vect_location,
2836 "Detected interleaving ");
2837 if (DR_IS_READ (dra))
2838 dump_printf (MSG_NOTE, "load ");
2839 else
2840 dump_printf (MSG_NOTE, "store ");
2841 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2842 dump_printf (MSG_NOTE, " and ");
2843 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2844 dump_printf (MSG_NOTE, "\n");
2847 /* Link the found element into the group list. */
2848 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2850 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2851 lastinfo = stmtinfo_a;
2853 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2854 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2855 lastinfo = stmtinfo_b;
2859 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2860 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2861 && !vect_analyze_data_ref_access (dr))
2863 if (dump_enabled_p ())
2864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2865 "not vectorized: complicated access pattern.\n");
2867 if (is_a <bb_vec_info> (vinfo))
2869 /* Mark the statement as not vectorizable. */
2870 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2871 continue;
2873 else
2875 datarefs_copy.release ();
2876 return false;
2880 datarefs_copy.release ();
2881 return true;
2885 /* Operator == between two dr_with_seg_len objects.
2887 This equality operator is used to make sure two data refs
2888 are the same, so that we will consider combining the
2889 aliasing checks of those two pairs of data-dependent data
2890 refs. */
2892 static bool
2893 operator == (const dr_with_seg_len& d1,
2894 const dr_with_seg_len& d2)
2896 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2897 DR_BASE_ADDRESS (d2.dr), 0)
2898 && compare_tree (DR_OFFSET (d1.dr), DR_OFFSET (d2.dr)) == 0
2899 && compare_tree (DR_INIT (d1.dr), DR_INIT (d2.dr)) == 0
2900 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2903 /* Function comp_dr_with_seg_len_pair.
2905 Comparison function for sorting objects of dr_with_seg_len_pair_t
2906 so that we can combine aliasing checks in one scan. */
2908 static int
2909 comp_dr_with_seg_len_pair (const void *pa_, const void *pb_)
2911 const dr_with_seg_len_pair_t* pa = (const dr_with_seg_len_pair_t *) pa_;
2912 const dr_with_seg_len_pair_t* pb = (const dr_with_seg_len_pair_t *) pb_;
2913 const dr_with_seg_len &a1 = pa->first, &a2 = pa->second;
2914 const dr_with_seg_len &b1 = pb->first, &b2 = pb->second;
2916 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2917 if a and c have the same base address and step, and b and d have the same
2918 base address and step. Therefore, if either a&c or b&d don't have the same
2919 address and step, we don't care about the order of those two pairs after sorting. */
2920 int comp_res;
2922 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a1.dr),
2923 DR_BASE_ADDRESS (b1.dr))) != 0)
2924 return comp_res;
2925 if ((comp_res = compare_tree (DR_BASE_ADDRESS (a2.dr),
2926 DR_BASE_ADDRESS (b2.dr))) != 0)
2927 return comp_res;
2928 if ((comp_res = compare_tree (DR_STEP (a1.dr), DR_STEP (b1.dr))) != 0)
2929 return comp_res;
2930 if ((comp_res = compare_tree (DR_STEP (a2.dr), DR_STEP (b2.dr))) != 0)
2931 return comp_res;
2932 if ((comp_res = compare_tree (DR_OFFSET (a1.dr), DR_OFFSET (b1.dr))) != 0)
2933 return comp_res;
2934 if ((comp_res = compare_tree (DR_INIT (a1.dr), DR_INIT (b1.dr))) != 0)
2935 return comp_res;
2936 if ((comp_res = compare_tree (DR_OFFSET (a2.dr), DR_OFFSET (b2.dr))) != 0)
2937 return comp_res;
2938 if ((comp_res = compare_tree (DR_INIT (a2.dr), DR_INIT (b2.dr))) != 0)
2939 return comp_res;
2941 return 0;
2944 /* Function vect_vfa_segment_size.
2946 Create an expression that computes the size of segment
2947 that will be accessed for a data reference. The function takes into
2948 account that realignment loads may access one more vector.
2950 Input:
2951 DR: The data reference.
2952 LENGTH_FACTOR: segment length to consider.
2954 Return an expression whose value is the size of segment which will be
2955 accessed by DR. */
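/* For example, a data reference with DR_STEP 4 and a LENGTH_FACTOR of 4
   gets a 16-byte segment length, plus one vector size when the
   dr_explicit_realign_optimized scheme is used.  */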
2957 static tree
2958 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2960 tree segment_length;
2962 if (integer_zerop (DR_STEP (dr)))
2963 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2964 else
2965 segment_length = size_binop (MULT_EXPR,
2966 fold_convert (sizetype, DR_STEP (dr)),
2967 fold_convert (sizetype, length_factor));
2969 if (vect_supportable_dr_alignment (dr, false)
2970 == dr_explicit_realign_optimized)
2972 tree vector_size = TYPE_SIZE_UNIT
2973 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2975 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2977 return segment_length;
2980 /* Function vect_no_alias_p.
2982 Given data references A and B with equal base and offset, the alias
2983 relation can be decided at compilation time, return TRUE if they do
2984 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
2985 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2986 respectively. */
2988 static bool
2989 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2990 tree segment_length_a, tree segment_length_b)
2992 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2993 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2994 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2995 return false;
2997 tree seg_a_min = DR_INIT (a);
2998 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2999 seg_a_min, segment_length_a);
3000 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3001 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3002 [a, a+12) */
3003 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
3005 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
3006 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
3007 seg_a_max, unit_size);
3008 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
3009 DR_INIT (a), unit_size);
3011 tree seg_b_min = DR_INIT (b);
3012 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
3013 seg_b_min, segment_length_b);
3014 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
3016 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
3017 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
3018 seg_b_max, unit_size);
3019 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
3020 DR_INIT (b), unit_size);
3023 if (tree_int_cst_le (seg_a_max, seg_b_min)
3024 || tree_int_cst_le (seg_b_max, seg_a_min))
3025 return true;
3027 return false;
3030 /* Function vect_prune_runtime_alias_test_list.
3032 Prune a list of ddrs to be tested at run-time by versioning for alias.
3033 Merge several alias checks into one if possible.
3034 Return FALSE if the resulting list of ddrs is longer than allowed by
3035 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3037 bool
3038 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3040 vec<ddr_p> may_alias_ddrs =
3041 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3042 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
3043 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3044 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3045 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3047 ddr_p ddr;
3048 unsigned int i;
3049 tree length_factor;
3051 if (dump_enabled_p ())
3052 dump_printf_loc (MSG_NOTE, vect_location,
3053 "=== vect_prune_runtime_alias_test_list ===\n");
3055 if (may_alias_ddrs.is_empty ())
3056 return true;
3058 /* Basically, for each pair of dependent data refs store_ptr_0
3059 and load_ptr_0, we create an expression:
3061 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3062 || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
3064 for aliasing checks. However, in some cases we can decrease
3065 the number of checks by combining two checks into one. For
3066 example, suppose we have another pair of data refs store_ptr_0
3067 and load_ptr_1, and if the following condition is satisfied:
3069 load_ptr_0 < load_ptr_1 &&
3070 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
3072 (this condition means that, in each iteration of the vectorized loop,
3073 the accessed memory of store_ptr_0 cannot be between the memory
3074 of load_ptr_0 and load_ptr_1.)
3076 we then can use only the following expression to finish the
3077 aliasing checks between store_ptr_0 & load_ptr_0 and
3078 store_ptr_0 & load_ptr_1:
3080 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
3081 || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
3083 Note that we only consider that load_ptr_0 and load_ptr_1 have the
3084 same base address. */
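/* For example, two checks of a store against loads from p and p + 4, each
   with a 16-byte segment, can be merged into a single check against p with
   a 20-byte segment.  */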
3086 comp_alias_ddrs.create (may_alias_ddrs.length ());
3088 /* First, we collect all data ref pairs for aliasing checks. */
3089 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3091 int comp_res;
3092 struct data_reference *dr_a, *dr_b;
3093 gimple *dr_group_first_a, *dr_group_first_b;
3094 tree segment_length_a, segment_length_b;
3095 gimple *stmt_a, *stmt_b;
3097 dr_a = DDR_A (ddr);
3098 stmt_a = DR_STMT (DDR_A (ddr));
3099 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3100 if (dr_group_first_a)
3102 stmt_a = dr_group_first_a;
3103 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3106 dr_b = DDR_B (ddr);
3107 stmt_b = DR_STMT (DDR_B (ddr));
3108 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3109 if (dr_group_first_b)
3111 stmt_b = dr_group_first_b;
3112 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3115 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3116 length_factor = scalar_loop_iters;
3117 else
3118 length_factor = size_int (vect_factor);
3119 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3120 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3122 comp_res = compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b));
3123 if (comp_res == 0)
3124 comp_res = compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
3126 /* Alias is known at compilation time. */
3127 if (comp_res == 0
3128 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3129 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3130 && TREE_CODE (segment_length_a) == INTEGER_CST
3131 && TREE_CODE (segment_length_b) == INTEGER_CST)
3133 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3134 continue;
3136 if (dump_enabled_p ())
3137 dump_printf_loc (MSG_NOTE, vect_location,
3138 "not vectorized: compilation time alias.\n");
3140 return false;
3143 dr_with_seg_len_pair_t dr_with_seg_len_pair
3144 (dr_with_seg_len (dr_a, segment_length_a),
3145 dr_with_seg_len (dr_b, segment_length_b));
3147 /* Canonicalize pairs by sorting the two DR members. */
3148 if (comp_res > 0)
3149 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3151 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3154 /* Second, we sort the collected data ref pairs so that we can scan
3155 them once to combine all possible aliasing checks. */
3156 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
3158 /* Third, we scan the sorted dr pairs and check if we can combine
3159 alias checks of two neighboring dr pairs. */
3160 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
3162 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
3163 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
3164 *dr_b1 = &comp_alias_ddrs[i-1].second,
3165 *dr_a2 = &comp_alias_ddrs[i].first,
3166 *dr_b2 = &comp_alias_ddrs[i].second;
3168 /* Remove duplicate data ref pairs. */
3169 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
3171 if (dump_enabled_p ())
3173 dump_printf_loc (MSG_NOTE, vect_location,
3174 "found equal ranges ");
3175 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3176 DR_REF (dr_a1->dr));
3177 dump_printf (MSG_NOTE, ", ");
3178 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3179 DR_REF (dr_b1->dr));
3180 dump_printf (MSG_NOTE, " and ");
3181 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3182 DR_REF (dr_a2->dr));
3183 dump_printf (MSG_NOTE, ", ");
3184 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3185 DR_REF (dr_b2->dr));
3186 dump_printf (MSG_NOTE, "\n");
3189 comp_alias_ddrs.ordered_remove (i--);
3190 continue;
3193 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3195 /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
3196 and DR_A1 and DR_A2 are two consecutive memrefs. */
3197 if (*dr_a1 == *dr_a2)
3199 std::swap (dr_a1, dr_b1);
3200 std::swap (dr_a2, dr_b2);
3203 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3204 DR_BASE_ADDRESS (dr_a2->dr), 0)
3205 || !operand_equal_p (DR_OFFSET (dr_a1->dr),
3206 DR_OFFSET (dr_a2->dr), 0)
3207 || !tree_fits_shwi_p (DR_INIT (dr_a1->dr))
3208 || !tree_fits_shwi_p (DR_INIT (dr_a2->dr)))
3209 continue;
3211 /* Make sure dr_a1 starts left of dr_a2. */
3212 if (tree_int_cst_lt (DR_INIT (dr_a2->dr), DR_INIT (dr_a1->dr)))
3213 std::swap (*dr_a1, *dr_a2);
3215 bool do_remove = false;
3216 unsigned HOST_WIDE_INT diff
3217 = (tree_to_shwi (DR_INIT (dr_a2->dr))
3218 - tree_to_shwi (DR_INIT (dr_a1->dr)));
3220 /* If the left segment does not extend beyond the start of the
3221 right segment the new segment length is that of the right
3222 plus the segment distance. */
3223 if (tree_fits_uhwi_p (dr_a1->seg_len)
3224 && compare_tree_int (dr_a1->seg_len, diff) <= 0)
3226 dr_a1->seg_len = size_binop (PLUS_EXPR, dr_a2->seg_len,
3227 size_int (diff));
3228 do_remove = true;
3230 /* Generally the new segment length is the maximum of the
3231 left segment size and the right segment size plus the distance.
3232 ??? We can also build tree MAX_EXPR here but it's not clear this
3233 is profitable. */
3234 else if (tree_fits_uhwi_p (dr_a1->seg_len)
3235 && tree_fits_uhwi_p (dr_a2->seg_len))
3237 unsigned HOST_WIDE_INT seg_len_a1 = tree_to_uhwi (dr_a1->seg_len);
3238 unsigned HOST_WIDE_INT seg_len_a2 = tree_to_uhwi (dr_a2->seg_len);
3239 dr_a1->seg_len = size_int (MAX (seg_len_a1, diff + seg_len_a2));
3240 do_remove = true;
3242 /* Now we check if the following condition is satisfied:
3244 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3246 where DIFF = DR_A2_INIT - DR_A1_INIT. However,
3247 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3248 have to make a best estimation. We can get the minimum value
3249 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3250 then either of the following two conditions can guarantee the
3251 one above:
3253 1: DIFF <= MIN_SEG_LEN_B
3254 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B */
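/* For example, with DIFF == 4 and MIN_SEG_LEN_B == 16, condition 1 holds
   and the merged segment length becomes SEGMENT_LENGTH_A2 + 4.  */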
3255 else
3257 unsigned HOST_WIDE_INT min_seg_len_b
3258 = (tree_fits_uhwi_p (dr_b1->seg_len)
3259 ? tree_to_uhwi (dr_b1->seg_len)
3260 : vect_factor);
3262 if (diff <= min_seg_len_b
3263 || (tree_fits_uhwi_p (dr_a1->seg_len)
3264 && diff - tree_to_uhwi (dr_a1->seg_len) < min_seg_len_b))
3266 dr_a1->seg_len = size_binop (PLUS_EXPR,
3267 dr_a2->seg_len, size_int (diff));
3268 do_remove = true;
3272 if (do_remove)
3274 if (dump_enabled_p ())
3276 dump_printf_loc (MSG_NOTE, vect_location,
3277 "merging ranges for ");
3278 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a1->dr));
3279 dump_printf (MSG_NOTE, ", ");
3280 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b1->dr));
3281 dump_printf (MSG_NOTE, " and ");
3282 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a2->dr));
3283 dump_printf (MSG_NOTE, ", ");
3284 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b2->dr));
3285 dump_printf (MSG_NOTE, "\n");
3287 comp_alias_ddrs.ordered_remove (i--);
3292 dump_printf_loc (MSG_NOTE, vect_location,
3293 "improved number of alias checks from %d to %d\n",
3294 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3295 if ((int) comp_alias_ddrs.length () >
3296 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3298 if (dump_enabled_p ())
3299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3300 "number of versioning for alias "
3301 "run-time tests exceeds %d "
3302 "(--param vect-max-version-for-alias-checks)\n",
3303 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3304 return false;
3307 /* All alias checks have been resolved at compilation time. */
3308 if (!comp_alias_ddrs.length ())
3309 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3311 return true;
3314 /* Return true if a non-affine read or write in STMT is suitable for a
3315 gather load or scatter store. Describe the operation in *INFO if so. */
3317 bool
3318 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3319 gather_scatter_info *info)
3321 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3322 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3323 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3324 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3325 tree offtype = NULL_TREE;
3326 tree decl, base, off;
3327 machine_mode pmode;
3328 int punsignedp, reversep, pvolatilep = 0;
3330 base = DR_REF (dr);
3331 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3332 see if we can use the def stmt of the address. */
3333 if (is_gimple_call (stmt)
3334 && gimple_call_internal_p (stmt)
3335 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3336 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3337 && TREE_CODE (base) == MEM_REF
3338 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3339 && integer_zerop (TREE_OPERAND (base, 1))
3340 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3342 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3343 if (is_gimple_assign (def_stmt)
3344 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3345 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3348 /* The gather and scatter builtins need address of the form
3349 loop_invariant + vector * {1, 2, 4, 8}
3351 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3352 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3353 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3354 multiplications and additions in it. To get a vector, we need
3355 a single SSA_NAME that will be defined in the loop and will
3356 contain everything that is not loop invariant and that can be
3357 vectorized. The following code attempts to find such a preexisting
3358 SSA_NAME OFF and put the loop invariants into a tree BASE
3359 that can be gimplified before the loop. */
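/* For example, for a gather like

     for (i = 0; i < n; i++)
       ... = a[idx[i]];

   with 4-byte elements this is expected to end up with BASE roughly &a, OFF
   the (possibly sign-extended) SSA_NAME holding idx[i] and SCALE 4.  */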
3360 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3361 &punsignedp, &reversep, &pvolatilep);
3362 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3364 if (TREE_CODE (base) == MEM_REF)
3366 if (!integer_zerop (TREE_OPERAND (base, 1)))
3368 if (off == NULL_TREE)
3370 offset_int moff = mem_ref_offset (base);
3371 off = wide_int_to_tree (sizetype, moff);
3373 else
3374 off = size_binop (PLUS_EXPR, off,
3375 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3377 base = TREE_OPERAND (base, 0);
3379 else
3380 base = build_fold_addr_expr (base);
3382 if (off == NULL_TREE)
3383 off = size_zero_node;
3385 /* If base is not loop invariant then, if off is 0, we start with just
3386 the constant offset in the loop invariant BASE and continue with base
3387 as OFF; otherwise give up.
3388 We could handle that case by gimplifying the addition of base + off
3389 into some SSA_NAME and use that as off, but for now punt. */
3390 if (!expr_invariant_in_loop_p (loop, base))
3392 if (!integer_zerop (off))
3393 return false;
3394 off = base;
3395 base = size_int (pbitpos / BITS_PER_UNIT);
3397 /* Otherwise put base + constant offset into the loop invariant BASE
3398 and continue with OFF. */
3399 else
3401 base = fold_convert (sizetype, base);
3402 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3405 /* OFF at this point may be either a SSA_NAME or some tree expression
3406 from get_inner_reference. Try to peel off loop invariants from it
3407 into BASE as long as possible. */
3408 STRIP_NOPS (off);
3409 while (offtype == NULL_TREE)
3411 enum tree_code code;
3412 tree op0, op1, add = NULL_TREE;
3414 if (TREE_CODE (off) == SSA_NAME)
3416 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3418 if (expr_invariant_in_loop_p (loop, off))
3419 return false;
3421 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3422 break;
3424 op0 = gimple_assign_rhs1 (def_stmt);
3425 code = gimple_assign_rhs_code (def_stmt);
3426 op1 = gimple_assign_rhs2 (def_stmt);
3428 else
3430 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3431 return false;
3432 code = TREE_CODE (off);
3433 extract_ops_from_tree (off, &code, &op0, &op1);
3435 switch (code)
3437 case POINTER_PLUS_EXPR:
3438 case PLUS_EXPR:
3439 if (expr_invariant_in_loop_p (loop, op0))
3441 add = op0;
3442 off = op1;
3443 do_add:
3444 add = fold_convert (sizetype, add);
3445 if (scale != 1)
3446 add = size_binop (MULT_EXPR, add, size_int (scale));
3447 base = size_binop (PLUS_EXPR, base, add);
3448 continue;
3450 if (expr_invariant_in_loop_p (loop, op1))
3452 add = op1;
3453 off = op0;
3454 goto do_add;
3456 break;
3457 case MINUS_EXPR:
3458 if (expr_invariant_in_loop_p (loop, op1))
3460 add = fold_convert (sizetype, op1);
3461 add = size_binop (MINUS_EXPR, size_zero_node, add);
3462 off = op0;
3463 goto do_add;
3465 break;
3466 case MULT_EXPR:
3467 if (scale == 1 && tree_fits_shwi_p (op1))
3469 scale = tree_to_shwi (op1);
3470 off = op0;
3471 continue;
3473 break;
3474 case SSA_NAME:
3475 off = op0;
3476 continue;
3477 CASE_CONVERT:
3478 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3479 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3480 break;
3481 if (TYPE_PRECISION (TREE_TYPE (op0))
3482 == TYPE_PRECISION (TREE_TYPE (off)))
3484 off = op0;
3485 continue;
3487 if (TYPE_PRECISION (TREE_TYPE (op0))
3488 < TYPE_PRECISION (TREE_TYPE (off)))
3490 off = op0;
3491 offtype = TREE_TYPE (off);
3492 STRIP_NOPS (off);
3493 continue;
3495 break;
3496 default:
3497 break;
3499 break;
3502 /* If at the end OFF still isn't a SSA_NAME or isn't
3503 defined in the loop, punt. */
3504 if (TREE_CODE (off) != SSA_NAME
3505 || expr_invariant_in_loop_p (loop, off))
3506 return false;
3508 if (offtype == NULL_TREE)
3509 offtype = TREE_TYPE (off);
3511 if (DR_IS_READ (dr))
3512 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3513 offtype, scale);
3514 else
3515 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3516 offtype, scale);
3518 if (decl == NULL_TREE)
3519 return false;
3521 info->decl = decl;
3522 info->base = base;
3523 info->offset = off;
3524 info->offset_dt = vect_unknown_def_type;
3525 info->offset_vectype = NULL_TREE;
3526 info->scale = scale;
3527 return true;
3530 /* Function vect_analyze_data_refs.
3532 Find all the data references in the loop or basic block.
3534 The general structure of the analysis of data refs in the vectorizer is as
3535 follows:
3536 1- vect_analyze_data_refs(loop/bb): call
3537 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3538 in the loop/bb and their dependences.
3539 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3540 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3541 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3545 bool
3546 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3548 struct loop *loop = NULL;
3549 unsigned int i;
3550 struct data_reference *dr;
3551 tree scalar_type;
3553 if (dump_enabled_p ())
3554 dump_printf_loc (MSG_NOTE, vect_location,
3555 "=== vect_analyze_data_refs ===\n");
3557 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3558 loop = LOOP_VINFO_LOOP (loop_vinfo);
3560 /* Go through the data-refs, check that the analysis succeeded. Update
3561 pointer from stmt_vec_info struct to DR and vectype. */
3563 vec<data_reference_p> datarefs = vinfo->datarefs;
3564 FOR_EACH_VEC_ELT (datarefs, i, dr)
3566 gimple *stmt;
3567 stmt_vec_info stmt_info;
3568 tree base, offset, init;
3569 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3570 bool simd_lane_access = false;
3571 int vf;
3573 again:
3574 if (!dr || !DR_REF (dr))
3576 if (dump_enabled_p ())
3577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3578 "not vectorized: unhandled data-ref\n");
3579 return false;
3582 stmt = DR_STMT (dr);
3583 stmt_info = vinfo_for_stmt (stmt);
3585 /* Discard clobbers from the dataref vector. We will remove
3586 clobber stmts during vectorization. */
3587 if (gimple_clobber_p (stmt))
3589 free_data_ref (dr);
3590 if (i == datarefs.length () - 1)
3592 datarefs.pop ();
3593 break;
3595 datarefs.ordered_remove (i);
3596 dr = datarefs[i];
3597 goto again;
3600 /* Check that analysis of the data-ref succeeded. */
3601 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3602 || !DR_STEP (dr))
3604 bool maybe_gather
3605 = DR_IS_READ (dr)
3606 && !TREE_THIS_VOLATILE (DR_REF (dr))
3607 && targetm.vectorize.builtin_gather != NULL;
3608 bool maybe_scatter
3609 = DR_IS_WRITE (dr)
3610 && !TREE_THIS_VOLATILE (DR_REF (dr))
3611 && targetm.vectorize.builtin_scatter != NULL;
3612 bool maybe_simd_lane_access
3613 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3615 /* If target supports vector gather loads or scatter stores, or if
3616 this might be a SIMD lane access, see if they can be used. */
3617 if (is_a <loop_vec_info> (vinfo)
3618 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3619 && !nested_in_vect_loop_p (loop, stmt))
3621 struct data_reference *newdr
3622 = create_data_ref (NULL, loop_containing_stmt (stmt),
3623 DR_REF (dr), stmt, maybe_scatter ? false : true);
3624 gcc_assert (newdr != NULL && DR_REF (newdr));
3625 if (DR_BASE_ADDRESS (newdr)
3626 && DR_OFFSET (newdr)
3627 && DR_INIT (newdr)
3628 && DR_STEP (newdr)
3629 && integer_zerop (DR_STEP (newdr)))
3631 if (maybe_simd_lane_access)
3633 tree off = DR_OFFSET (newdr);
3634 STRIP_NOPS (off);
3635 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3636 && TREE_CODE (off) == MULT_EXPR
3637 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3639 tree step = TREE_OPERAND (off, 1);
3640 off = TREE_OPERAND (off, 0);
3641 STRIP_NOPS (off);
3642 if (CONVERT_EXPR_P (off)
3643 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3644 0)))
3645 < TYPE_PRECISION (TREE_TYPE (off)))
3646 off = TREE_OPERAND (off, 0);
3647 if (TREE_CODE (off) == SSA_NAME)
3649 gimple *def = SSA_NAME_DEF_STMT (off);
3650 tree reft = TREE_TYPE (DR_REF (newdr));
3651 if (is_gimple_call (def)
3652 && gimple_call_internal_p (def)
3653 && (gimple_call_internal_fn (def)
3654 == IFN_GOMP_SIMD_LANE))
3656 tree arg = gimple_call_arg (def, 0);
3657 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3658 arg = SSA_NAME_VAR (arg);
3659 if (arg == loop->simduid
3660 /* For now. */
3661 && tree_int_cst_equal
3662 (TYPE_SIZE_UNIT (reft),
3663 step))
3665 DR_OFFSET (newdr) = ssize_int (0);
3666 DR_STEP (newdr) = step;
3667 DR_ALIGNED_TO (newdr)
3668 = size_int (BIGGEST_ALIGNMENT);
3669 dr = newdr;
3670 simd_lane_access = true;
3676 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3678 dr = newdr;
3679 if (maybe_gather)
3680 gatherscatter = GATHER;
3681 else
3682 gatherscatter = SCATTER;
3685 if (gatherscatter == SG_NONE && !simd_lane_access)
3686 free_data_ref (newdr);
3689 if (gatherscatter == SG_NONE && !simd_lane_access)
3691 if (dump_enabled_p ())
3693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3694 "not vectorized: data ref analysis "
3695 "failed ");
3696 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3699 if (is_a <bb_vec_info> (vinfo))
3700 break;
3702 return false;
3706 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3708 if (dump_enabled_p ())
3709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3710 "not vectorized: base addr of dr is a "
3711 "constant\n");
3713 if (is_a <bb_vec_info> (vinfo))
3714 break;
3716 if (gatherscatter != SG_NONE || simd_lane_access)
3717 free_data_ref (dr);
3718 return false;
3721 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3723 if (dump_enabled_p ())
3725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3726 "not vectorized: volatile type ");
3727 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3730 if (is_a <bb_vec_info> (vinfo))
3731 break;
3733 return false;
3736 if (stmt_can_throw_internal (stmt))
3738 if (dump_enabled_p ())
3740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3741 "not vectorized: statement can throw an "
3742 "exception ");
3743 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3746 if (is_a <bb_vec_info> (vinfo))
3747 break;
3749 if (gatherscatter != SG_NONE || simd_lane_access)
3750 free_data_ref (dr);
3751 return false;
3754 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3755 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3757 if (dump_enabled_p ())
3759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3760 "not vectorized: statement is bitfield "
3761 "access ");
3762 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3765 if (is_a <bb_vec_info> (vinfo))
3766 break;
3768 if (gatherscatter != SG_NONE || simd_lane_access)
3769 free_data_ref (dr);
3770 return false;
3773 base = unshare_expr (DR_BASE_ADDRESS (dr));
3774 offset = unshare_expr (DR_OFFSET (dr));
3775 init = unshare_expr (DR_INIT (dr));
3777 if (is_gimple_call (stmt)
3778 && (!gimple_call_internal_p (stmt)
3779 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3780 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3782 if (dump_enabled_p ())
3784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3785 "not vectorized: dr in a call ");
3786 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3789 if (is_a <bb_vec_info> (vinfo))
3790 break;
3792 if (gatherscatter != SG_NONE || simd_lane_access)
3793 free_data_ref (dr);
3794 return false;
3797 /* Update DR field in stmt_vec_info struct. */
3799 /* If the dataref is in an inner-loop of the loop that is considered for
3800 vectorization, we also want to analyze the access relative to
3801 the outer-loop (DR contains information only relative to the
3802 inner-most enclosing loop). We do that by building a reference to the
3803 first location accessed by the inner-loop, and analyze it relative to
3804 the outer-loop. */
3805 if (loop && nested_in_vect_loop_p (loop, stmt))
3807 tree outer_step, outer_base, outer_init;
3808 HOST_WIDE_INT pbitsize, pbitpos;
3809 tree poffset;
3810 machine_mode pmode;
3811 int punsignedp, preversep, pvolatilep;
3812 affine_iv base_iv, offset_iv;
3813 tree dinit;
3815 /* Build a reference to the first location accessed by the
3816 inner-loop: *(BASE+INIT). (The first location is actually
3817 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3818 tree inner_base = build_fold_indirect_ref
3819 (fold_build_pointer_plus (base, init));
3821 if (dump_enabled_p ())
3823 dump_printf_loc (MSG_NOTE, vect_location,
3824 "analyze in outer-loop: ");
3825 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3826 dump_printf (MSG_NOTE, "\n");
3829 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3830 &poffset, &pmode, &punsignedp,
3831 &preversep, &pvolatilep);
3832 gcc_assert (outer_base != NULL_TREE);
3834 if (pbitpos % BITS_PER_UNIT != 0)
3836 if (dump_enabled_p ())
3837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3838 "failed: bit offset alignment.\n");
3839 return false;
3842 if (preversep)
3844 if (dump_enabled_p ())
3845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3846 "failed: reverse storage order.\n");
3847 return false;
3850 outer_base = build_fold_addr_expr (outer_base);
3851 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3852 &base_iv, false))
3854 if (dump_enabled_p ())
3855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3856 "failed: evolution of base is not affine.\n");
3857 return false;
3860 if (offset)
3862 if (poffset)
3863 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3864 poffset);
3865 else
3866 poffset = offset;
3869 if (!poffset)
3871 offset_iv.base = ssize_int (0);
3872 offset_iv.step = ssize_int (0);
3874 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3875 &offset_iv, false))
3877 if (dump_enabled_p ())
3878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3879 "evolution of offset is not affine.\n");
3880 return false;
3883 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3884 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3885 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3886 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3887 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3889 outer_step = size_binop (PLUS_EXPR,
3890 fold_convert (ssizetype, base_iv.step),
3891 fold_convert (ssizetype, offset_iv.step));
3893 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3894 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3895 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3896 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3897 STMT_VINFO_DR_OFFSET (stmt_info) =
3898 fold_convert (ssizetype, offset_iv.base);
3899 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3900 size_int (highest_pow2_factor (offset_iv.base));
3902 if (dump_enabled_p ())
3904 dump_printf_loc (MSG_NOTE, vect_location,
3905 "\touter base_address: ");
3906 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3907 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3908 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3909 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3910 STMT_VINFO_DR_OFFSET (stmt_info));
3911 dump_printf (MSG_NOTE,
3912 "\n\touter constant offset from base address: ");
3913 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3914 STMT_VINFO_DR_INIT (stmt_info));
3915 dump_printf (MSG_NOTE, "\n\touter step: ");
3916 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3917 STMT_VINFO_DR_STEP (stmt_info));
3918 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3919 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3920 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3921 dump_printf (MSG_NOTE, "\n");
3925 if (STMT_VINFO_DATA_REF (stmt_info))
3927 if (dump_enabled_p ())
3929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3930 "not vectorized: more than one data ref "
3931 "in stmt: ");
3932 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3935 if (is_a <bb_vec_info> (vinfo))
3936 break;
3938 if (gatherscatter != SG_NONE || simd_lane_access)
3939 free_data_ref (dr);
3940 return false;
3943 STMT_VINFO_DATA_REF (stmt_info) = dr;
3944 if (simd_lane_access)
3946 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3947 free_data_ref (datarefs[i]);
3948 datarefs[i] = dr;
3951 /* Set vectype for STMT. */
3952 scalar_type = TREE_TYPE (DR_REF (dr));
3953 STMT_VINFO_VECTYPE (stmt_info)
3954 = get_vectype_for_scalar_type (scalar_type);
3955 if (!STMT_VINFO_VECTYPE (stmt_info))
3957 if (dump_enabled_p ())
3959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3960 "not vectorized: no vectype for stmt: ");
3961 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3962 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3963 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3964 scalar_type);
3965 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3968 if (is_a <bb_vec_info> (vinfo))
3970 /* No vector type is fine, the ref can still participate
3971 in dependence analysis, we just can't vectorize it. */
3972 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3973 continue;
3976 if (gatherscatter != SG_NONE || simd_lane_access)
3978 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3979 if (gatherscatter != SG_NONE)
3980 free_data_ref (dr);
3982 return false;
3984 else
3986 if (dump_enabled_p ())
3988 dump_printf_loc (MSG_NOTE, vect_location,
3989 "got vectype for stmt: ");
3990 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3991 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3992 STMT_VINFO_VECTYPE (stmt_info));
3993 dump_printf (MSG_NOTE, "\n");
3997 /* Adjust the minimal vectorization factor according to the
3998 vector type. */
3999 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4000 if (vf > *min_vf)
4001 *min_vf = vf;
4003 if (gatherscatter != SG_NONE)
4005 gather_scatter_info gs_info;
4006 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
4007 &gs_info)
4008 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4010 STMT_VINFO_DATA_REF (stmt_info) = NULL;
4011 free_data_ref (dr);
4012 if (dump_enabled_p ())
4014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4015 (gatherscatter == GATHER) ?
4016 "not vectorized: not suitable for gather "
4017 "load " :
4018 "not vectorized: not suitable for scatter "
4019 "store ");
4020 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4022 return false;
4025 free_data_ref (datarefs[i]);
4026 datarefs[i] = dr;
4027 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4030 else if (is_a <loop_vec_info> (vinfo)
4031 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4033 if (nested_in_vect_loop_p (loop, stmt))
4035 if (dump_enabled_p ())
4037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4038 "not vectorized: not suitable for strided "
4039 "load ");
4040 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4042 return false;
4044 STMT_VINFO_STRIDED_P (stmt_info) = true;
4048 /* If we stopped analysis at the first dataref we could not analyze
4049 when trying to vectorize a basic-block, mark the rest of the datarefs
4050 as not vectorizable and truncate the vector of datarefs. That
4051 avoids spending useless time in analyzing their dependence. */
4052 if (i != datarefs.length ())
4054 gcc_assert (is_a <bb_vec_info> (vinfo));
4055 for (unsigned j = i; j < datarefs.length (); ++j)
4057 data_reference_p dr = datarefs[j];
4058 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
4059 free_data_ref (dr);
4061 datarefs.truncate (i);
4064 return true;
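/* Illustrative sketch, not part of this file: a data reference whose
   DR_STEP is not an INTEGER_CST, so the loop-vectorization path above
   marks it STMT_VINFO_STRIDED_P instead of treating it as a contiguous
   access.  The column walk a[i * lda] advances by lda * sizeof (double)
   bytes, known only at run time.  Hypothetical names.  */

static void
strided_example (double *a, int lda, int n)
{
  for (int i = 0; i < n; i++)
    a[i * lda] += 1.0;	/* step = lda * sizeof (double), not a constant */
}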
4068 /* Function vect_get_new_vect_var.
4070 Returns a new variable named according to the vectorizer's naming
4071 scheme: the prefix "vect_", "stmp_", "mask_" or "vectp_" (depending on
4072 the value of VAR_KIND) is prepended to NAME if it is provided, and
4073 otherwise the prefix alone is used. */
4075 tree
4076 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4078 const char *prefix;
4079 tree new_vect_var;
4081 switch (var_kind)
4083 case vect_simple_var:
4084 prefix = "vect";
4085 break;
4086 case vect_scalar_var:
4087 prefix = "stmp";
4088 break;
4089 case vect_mask_var:
4090 prefix = "mask";
4091 break;
4092 case vect_pointer_var:
4093 prefix = "vectp";
4094 break;
4095 default:
4096 gcc_unreachable ();
4099 if (name)
4101 char* tmp = concat (prefix, "_", name, NULL);
4102 new_vect_var = create_tmp_reg (type, tmp);
4103 free (tmp);
4105 else
4106 new_vect_var = create_tmp_reg (type, prefix);
4108 return new_vect_var;
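/* Illustrative sketch, not part of this file: the generated names are
   simply PREFIX "_" NAME, e.g. "vectp_a" for a vect_pointer_var whose
   base is called "a", or the bare prefix when no name is available.
   Hypothetical names.  */

#include <stdio.h>

static void
show_vect_name (char *buf, size_t len, const char *prefix, const char *name)
{
  if (name)
    snprintf (buf, len, "%s_%s", prefix, name);	/* e.g. "vectp_a" */
  else
    snprintf (buf, len, "%s", prefix);		/* e.g. "vect"    */
}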
4111 /* Like vect_get_new_vect_var but return an SSA name. */
4113 tree
4114 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4116 const char *prefix;
4117 tree new_vect_var;
4119 switch (var_kind)
4121 case vect_simple_var:
4122 prefix = "vect";
4123 break;
4124 case vect_scalar_var:
4125 prefix = "stmp";
4126 break;
4127 case vect_pointer_var:
4128 prefix = "vectp";
4129 break;
4130 default:
4131 gcc_unreachable ();
4134 if (name)
4136 char* tmp = concat (prefix, "_", name, NULL);
4137 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4138 free (tmp);
4140 else
4141 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4143 return new_vect_var;
4146 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
4148 static void
4149 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
4150 stmt_vec_info stmt_info)
4152 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
4153 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
4154 int misalign = DR_MISALIGNMENT (dr);
4155 if (misalign == -1)
4156 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4157 else
4158 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
4161 /* Function vect_create_addr_base_for_vector_ref.
4163 Create an expression that computes the address of the first memory location
4164 that will be accessed for a data reference.
4166 Input:
4167 STMT: The statement containing the data reference.
4168 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4169 OFFSET: Optional. If supplied, it is added to the initial address.
4170 LOOP: Specify relative to which loop-nest the address should be computed.
4171 For example, when the dataref is in an inner-loop nested in an
4172 outer-loop that is now being vectorized, LOOP can be either the
4173 outer-loop, or the inner-loop. The first memory location accessed
4174 by the following dataref ('in' points to short):
4176 for (i=0; i<N; i++)
4177 for (j=0; j<M; j++)
4178 s += in[i+j]
4180 is as follows:
4181 if LOOP=i_loop: &in (relative to i_loop)
4182 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4183 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4184 initial address. Unlike OFFSET, which is a number of elements to
4185 be added, BYTE_OFFSET is measured in bytes.
4187 Output:
4188 1. Return an SSA_NAME whose value is the address of the memory location of
4189 the first vector of the data reference.
4190 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4191 these statement(s) which define the returned SSA_NAME.
4193 FORNOW: We are only handling array accesses with step 1. */
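/* Illustrative sketch, not part of this file: the address arithmetic the
   function below emits, written as plain C.  OFFSET counts elements of
   the reference type and BYTE_OFFSET counts bytes; both default to zero.
   Hypothetical names.  */

#include <stddef.h>

static char *
addr_base_example (char *dr_base, size_t dr_offset, size_t dr_init,
		   size_t offset, size_t elem_size, size_t byte_offset)
{
  size_t base_offset = dr_offset + dr_init;	/* DR_OFFSET + DR_INIT             */
  base_offset += offset * elem_size;		/* OFFSET is counted in elements   */
  base_offset += byte_offset;			/* BYTE_OFFSET is already in bytes */
  return dr_base + base_offset;			/* DR_BASE_ADDRESS + base_offset   */
}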
4195 tree
4196 vect_create_addr_base_for_vector_ref (gimple *stmt,
4197 gimple_seq *new_stmt_list,
4198 tree offset,
4199 struct loop *loop,
4200 tree byte_offset)
4202 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4203 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4204 tree data_ref_base;
4205 const char *base_name;
4206 tree addr_base;
4207 tree dest;
4208 gimple_seq seq = NULL;
4209 tree base_offset;
4210 tree init;
4211 tree vect_ptr_type;
4212 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4213 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4215 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4217 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4219 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4221 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4222 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4223 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4225 else
4227 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4228 base_offset = unshare_expr (DR_OFFSET (dr));
4229 init = unshare_expr (DR_INIT (dr));
4232 if (loop_vinfo)
4233 base_name = get_name (data_ref_base);
4234 else
4236 base_offset = ssize_int (0);
4237 init = ssize_int (0);
4238 base_name = get_name (DR_REF (dr));
4241 /* Create base_offset */
4242 base_offset = size_binop (PLUS_EXPR,
4243 fold_convert (sizetype, base_offset),
4244 fold_convert (sizetype, init));
4246 if (offset)
4248 offset = fold_build2 (MULT_EXPR, sizetype,
4249 fold_convert (sizetype, offset), step);
4250 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4251 base_offset, offset);
4253 if (byte_offset)
4255 byte_offset = fold_convert (sizetype, byte_offset);
4256 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4257 base_offset, byte_offset);
4260 /* base + base_offset */
4261 if (loop_vinfo)
4262 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4263 else
4265 addr_base = build1 (ADDR_EXPR,
4266 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4267 unshare_expr (DR_REF (dr)));
4270 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4271 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4272 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4273 gimple_seq_add_seq (new_stmt_list, seq);
4275 if (DR_PTR_INFO (dr)
4276 && TREE_CODE (addr_base) == SSA_NAME
4277 && !SSA_NAME_PTR_INFO (addr_base))
4279 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4280 if (offset || byte_offset)
4281 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4284 if (dump_enabled_p ())
4286 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4287 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4288 dump_printf (MSG_NOTE, "\n");
4291 return addr_base;
4295 /* Function vect_create_data_ref_ptr.
4297 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4298 location accessed in the loop by STMT, along with the def-use update
4299 chain to appropriately advance the pointer through the loop iterations.
4300 Also set aliasing information for the pointer. This pointer is used by
4301 the callers to this function to create a memory reference expression for
4302 vector load/store access.
4304 Input:
4305 1. STMT: a stmt that references memory. Expected to be of the form
4306 GIMPLE_ASSIGN <name, data-ref> or
4307 GIMPLE_ASSIGN <data-ref, name>.
4308 2. AGGR_TYPE: the type of the reference, which should be either a vector
4309 or an array.
4310 3. AT_LOOP: the loop where the vector memref is to be created.
4311 4. OFFSET (optional): an offset to be added to the initial address accessed
4312 by the data-ref in STMT.
4313 5. BSI: location where the new stmts are to be placed if there is no loop
4314 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4315 pointing to the initial address.
4316 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4317 to the initial address accessed by the data-ref in STMT. This is
4318 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4319 in bytes.
4321 Output:
4322 1. Declare a new ptr to vector_type, and have it point to the base of the
4323 data reference (the initial address accessed by the data reference).
4324 For example, for vector of type V8HI, the following code is generated:
4326 v8hi *ap;
4327 ap = (v8hi *)initial_address;
4329 if OFFSET is not supplied:
4330 initial_address = &a[init];
4331 if OFFSET is supplied:
4332 initial_address = &a[init + OFFSET];
4333 if BYTE_OFFSET is supplied:
4334 initial_address = &a[init] + BYTE_OFFSET;
4336 Return the initial_address in INITIAL_ADDRESS.
4338 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4339 update the pointer in each iteration of the loop.
4341 Return the increment stmt that updates the pointer in PTR_INCR.
4343 3. Set INV_P to true if the access pattern of the data reference in the
4344 vectorized loop is invariant. Set it to false otherwise.
4346 4. Return the pointer. */
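/* Illustrative sketch, not part of this file: the shape of the code this
   function arranges for, written with the GCC vector extension.  The
   aggregate pointer starts at the initial address and, unless ONLY_INIT,
   is advanced by the aggregate size every iteration -- the vp0/vp1/vp2
   cycle described further below.  Assumes &a[init] is suitably aligned
   and N is a multiple of 8; hypothetical names.  */

typedef short v8hi __attribute__ ((vector_size (16)));

static void
data_ref_ptr_example (short *a, int init, int n)
{
  v8hi one = { 1, 1, 1, 1, 1, 1, 1, 1 };
  v8hi *ap = (v8hi *) &a[init];	/* ap = (v8hi *) initial_address       */
  for (int i = 0; i < n; i += 8)
    {
      *ap = *ap + one;		/* the vector access through AP        */
      ap = ap + 1;		/* bump by TYPE_SIZE_UNIT (v8hi) == 16 */
    }
}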
4348 tree
4349 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4350 tree offset, tree *initial_address,
4351 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4352 bool only_init, bool *inv_p, tree byte_offset)
4354 const char *base_name;
4355 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4356 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4357 struct loop *loop = NULL;
4358 bool nested_in_vect_loop = false;
4359 struct loop *containing_loop = NULL;
4360 tree aggr_ptr_type;
4361 tree aggr_ptr;
4362 tree new_temp;
4363 gimple_seq new_stmt_list = NULL;
4364 edge pe = NULL;
4365 basic_block new_bb;
4366 tree aggr_ptr_init;
4367 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4368 tree aptr;
4369 gimple_stmt_iterator incr_gsi;
4370 bool insert_after;
4371 tree indx_before_incr, indx_after_incr;
4372 gimple *incr;
4373 tree step;
4374 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4376 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4377 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4379 if (loop_vinfo)
4381 loop = LOOP_VINFO_LOOP (loop_vinfo);
4382 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4383 containing_loop = (gimple_bb (stmt))->loop_father;
4384 pe = loop_preheader_edge (loop);
4386 else
4388 gcc_assert (bb_vinfo);
4389 only_init = true;
4390 *ptr_incr = NULL;
4393 /* Check the step (evolution) of the load in LOOP, and record
4394 whether it's invariant. */
4395 if (nested_in_vect_loop)
4396 step = STMT_VINFO_DR_STEP (stmt_info);
4397 else
4398 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4400 if (integer_zerop (step))
4401 *inv_p = true;
4402 else
4403 *inv_p = false;
4405 /* Create an expression for the first address accessed by this load
4406 in LOOP. */
4407 base_name = get_name (DR_BASE_ADDRESS (dr));
4409 if (dump_enabled_p ())
4411 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4412 dump_printf_loc (MSG_NOTE, vect_location,
4413 "create %s-pointer variable to type: ",
4414 get_tree_code_name (TREE_CODE (aggr_type)));
4415 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4416 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4417 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4418 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4419 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4420 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4421 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4422 else
4423 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4424 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4425 dump_printf (MSG_NOTE, "\n");
4428 /* (1) Create the new aggregate-pointer variable.
4429 Vector and array types inherit the alias set of their component
4430 type by default so we need to use a ref-all pointer if the data
4431 reference does not conflict with the created aggregated data
4432 reference because it is not addressable. */
4433 bool need_ref_all = false;
4434 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4435 get_alias_set (DR_REF (dr))))
4436 need_ref_all = true;
4437 /* Likewise for any of the data references in the stmt group. */
4438 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4440 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4443 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4444 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4445 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4446 get_alias_set (DR_REF (sdr))))
4448 need_ref_all = true;
4449 break;
4451 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4453 while (orig_stmt);
4455 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4456 need_ref_all);
4457 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4460 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4461 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4462 def-use update cycles for the pointer: one relative to the outer-loop
4463 (LOOP), which is what steps (3) and (4) below do. The other is relative
4464 to the inner-loop (which is the inner-most loop containing the dataref),
4465 and this is done by step (5) below.
4467 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4468 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4469 redundant. Steps (3),(4) create the following:
4471 vp0 = &base_addr;
4472 LOOP: vp1 = phi(vp0,vp2)
4475 vp2 = vp1 + step
4476 goto LOOP
4478 If there is an inner-loop nested in loop, then step (5) will also be
4479 applied, and an additional update in the inner-loop will be created:
4481 vp0 = &base_addr;
4482 LOOP: vp1 = phi(vp0,vp2)
4484 inner: vp3 = phi(vp1,vp4)
4485 vp4 = vp3 + inner_step
4486 if () goto inner
4488 vp2 = vp1 + step
4489 if () goto LOOP */
4491 /* (2) Calculate the initial address of the aggregate-pointer, and set
4492 the aggregate-pointer to point to it before the loop. */
4494 /* Create: &(base[init_val + offset]) + byte_offset in the loop preheader. */
4496 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4497 offset, loop, byte_offset);
4498 if (new_stmt_list)
4500 if (pe)
4502 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4503 gcc_assert (!new_bb);
4505 else
4506 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4509 *initial_address = new_temp;
4510 aggr_ptr_init = new_temp;
4512 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4513 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4514 inner-loop nested in LOOP (during outer-loop vectorization). */
4516 /* No update in loop is required. */
4517 if (only_init && (!loop_vinfo || at_loop == loop))
4518 aptr = aggr_ptr_init;
4519 else
4521 /* The step of the aggregate pointer is the type size. */
4522 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4523 /* One exception to the above is when the scalar step of the load in
4524 LOOP is zero. In this case the step here is also zero. */
4525 if (*inv_p)
4526 iv_step = size_zero_node;
4527 else if (tree_int_cst_sgn (step) == -1)
4528 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4530 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4532 create_iv (aggr_ptr_init,
4533 fold_convert (aggr_ptr_type, iv_step),
4534 aggr_ptr, loop, &incr_gsi, insert_after,
4535 &indx_before_incr, &indx_after_incr);
4536 incr = gsi_stmt (incr_gsi);
4537 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4539 /* Copy the points-to information if it exists. */
4540 if (DR_PTR_INFO (dr))
4542 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4543 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4545 if (ptr_incr)
4546 *ptr_incr = incr;
4548 aptr = indx_before_incr;
4551 if (!nested_in_vect_loop || only_init)
4552 return aptr;
4555 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4556 nested in LOOP, if it exists. */
4558 gcc_assert (nested_in_vect_loop);
4559 if (!only_init)
4561 standard_iv_increment_position (containing_loop, &incr_gsi,
4562 &insert_after);
4563 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4564 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4565 &indx_after_incr);
4566 incr = gsi_stmt (incr_gsi);
4567 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4569 /* Copy the points-to information if it exists. */
4570 if (DR_PTR_INFO (dr))
4572 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4573 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4575 if (ptr_incr)
4576 *ptr_incr = incr;
4578 return indx_before_incr;
4580 else
4581 gcc_unreachable ();
4585 /* Function bump_vector_ptr
4587 Increment a pointer (to a vector type) by vector-size. If requested,
4588 i.e. if PTR_INCR is given, then also connect the new increment stmt
4589 to the existing def-use update-chain of the pointer, by modifying
4590 the PTR_INCR as illustrated below:
4592 The pointer def-use update-chain before this function:
4593 DATAREF_PTR = phi (p_0, p_2)
4594 ....
4595 PTR_INCR: p_2 = DATAREF_PTR + step
4597 The pointer def-use update-chain after this function:
4598 DATAREF_PTR = phi (p_0, p_2)
4599 ....
4600 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4601 ....
4602 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4604 Input:
4605 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4606 in the loop.
4607 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4608 the loop. The increment amount across iterations is expected
4609 to be vector_size.
4610 BSI - location where the new update stmt is to be placed.
4611 STMT - the original scalar memory-access stmt that is being vectorized.
4612 BUMP - optional. The offset by which to bump the pointer. If not given,
4613 the offset is assumed to be vector_size.
4615 Output: Return NEW_DATAREF_PTR as illustrated above.
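/* Illustrative sketch, not part of this file: what the bump accomplishes
   when one vectorized iteration touches two consecutive vectors.  The
   second vector is addressed through a pointer bumped by one vector
   size, and the cross-iteration increment then steps from the bumped
   pointer, preserving the DATAREF_PTR -> NEW_DATAREF_PTR -> PTR_INCR
   chain shown above.  Hypothetical names; assumes N is a multiple
   of 8.  */

typedef int v4si __attribute__ ((vector_size (16)));

static void
bump_example (const v4si *p, v4si *out, int n)
{
  for (int i = 0; i < n; i += 8)	/* eight ints == two vectors per iteration */
    {
      out[0] = p[0];			/* access through DATAREF_PTR              */
      const v4si *np = p + 1;		/* NEW_DATAREF_PTR = DATAREF_PTR + BUMP    */
      out[1] = np[0];			/* second vector through the bumped ptr    */
      out += 2;
      p = np + 1;			/* PTR_INCR: p_2 = NEW_DATAREF_PTR + step  */
    }
}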
4619 tree
4620 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4621 gimple *stmt, tree bump)
4623 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4624 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4625 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4626 tree update = TYPE_SIZE_UNIT (vectype);
4627 gassign *incr_stmt;
4628 ssa_op_iter iter;
4629 use_operand_p use_p;
4630 tree new_dataref_ptr;
4632 if (bump)
4633 update = bump;
4635 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4636 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4637 else
4638 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4639 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4640 dataref_ptr, update);
4641 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4643 /* Copy the points-to information if it exists. */
4644 if (DR_PTR_INFO (dr))
4646 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4647 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4650 if (!ptr_incr)
4651 return new_dataref_ptr;
4653 /* Update the vector-pointer's cross-iteration increment. */
4654 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4656 tree use = USE_FROM_PTR (use_p);
4658 if (use == dataref_ptr)
4659 SET_USE (use_p, new_dataref_ptr);
4660 else
4661 gcc_assert (tree_int_cst_compare (use, update) == 0);
4664 return new_dataref_ptr;
4668 /* Function vect_create_destination_var.
4670 Create a new temporary of type VECTYPE. */
4672 tree
4673 vect_create_destination_var (tree scalar_dest, tree vectype)
4675 tree vec_dest;
4676 const char *name;
4677 char *new_name;
4678 tree type;
4679 enum vect_var_kind kind;
4681 kind = vectype
4682 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4683 ? vect_mask_var
4684 : vect_simple_var
4685 : vect_scalar_var;
4686 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4688 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4690 name = get_name (scalar_dest);
4691 if (name)
4692 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4693 else
4694 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4695 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4696 free (new_name);
4698 return vec_dest;
4701 /* Function vect_grouped_store_supported.
4703 Returns TRUE if interleave high and interleave low permutations
4704 are supported, and FALSE otherwise. */
4706 bool
4707 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4709 machine_mode mode = TYPE_MODE (vectype);
4711 /* vect_permute_store_chain requires the group size to be equal to 3 or
4712 be a power of two. */
4713 if (count != 3 && exact_log2 (count) == -1)
4715 if (dump_enabled_p ())
4716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4717 "the size of the group of accesses"
4718 " is not a power of 2 or not eqaul to 3\n");
4719 return false;
4722 /* Check that the permutation is supported. */
4723 if (VECTOR_MODE_P (mode))
4725 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4726 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4728 if (count == 3)
4730 unsigned int j0 = 0, j1 = 0, j2 = 0;
4731 unsigned int i, j;
4733 for (j = 0; j < 3; j++)
4735 int nelt0 = ((3 - j) * nelt) % 3;
4736 int nelt1 = ((3 - j) * nelt + 1) % 3;
4737 int nelt2 = ((3 - j) * nelt + 2) % 3;
4738 for (i = 0; i < nelt; i++)
4740 if (3 * i + nelt0 < nelt)
4741 sel[3 * i + nelt0] = j0++;
4742 if (3 * i + nelt1 < nelt)
4743 sel[3 * i + nelt1] = nelt + j1++;
4744 if (3 * i + nelt2 < nelt)
4745 sel[3 * i + nelt2] = 0;
4747 if (!can_vec_perm_p (mode, false, sel))
4749 if (dump_enabled_p ())
4750 dump_printf (MSG_MISSED_OPTIMIZATION,
4751 "permutaion op not supported by target.\n");
4752 return false;
4755 for (i = 0; i < nelt; i++)
4757 if (3 * i + nelt0 < nelt)
4758 sel[3 * i + nelt0] = 3 * i + nelt0;
4759 if (3 * i + nelt1 < nelt)
4760 sel[3 * i + nelt1] = 3 * i + nelt1;
4761 if (3 * i + nelt2 < nelt)
4762 sel[3 * i + nelt2] = nelt + j2++;
4764 if (!can_vec_perm_p (mode, false, sel))
4766 if (dump_enabled_p ())
4767 dump_printf (MSG_MISSED_OPTIMIZATION,
4768 "permutaion op not supported by target.\n");
4769 return false;
4772 return true;
4774 else
4776 /* If length is not equal to 3 then only power of 2 is supported. */
4777 gcc_assert (pow2p_hwi (count));
4779 for (i = 0; i < nelt / 2; i++)
4781 sel[i * 2] = i;
4782 sel[i * 2 + 1] = i + nelt;
4784 if (can_vec_perm_p (mode, false, sel))
4786 for (i = 0; i < nelt; i++)
4787 sel[i] += nelt / 2;
4788 if (can_vec_perm_p (mode, false, sel))
4789 return true;
4794 if (dump_enabled_p ())
4795 dump_printf (MSG_MISSED_OPTIMIZATION,
4796 "permutaion op not supported by target.\n");
4797 return false;
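/* Illustrative sketch, not part of this file: the two selector vectors
   the power-of-two path above checks with can_vec_perm_p, computed for a
   given NELT.  For NELT == 8 it prints
     high: 0 8 1 9 2 10 3 11
     low:  4 12 5 13 6 14 7 15
   i.e. the interleave-high/low permutations used for grouped stores.  */

#include <stdio.h>

static void
print_interleave_sels (unsigned nelt)
{
  unsigned sel[256];			/* assumes nelt <= 256 */
  for (unsigned i = 0; i < nelt / 2; i++)
    {
      sel[i * 2] = i;
      sel[i * 2 + 1] = i + nelt;
    }
  printf ("high:");
  for (unsigned i = 0; i < nelt; i++)
    printf (" %u", sel[i]);
  for (unsigned i = 0; i < nelt; i++)
    sel[i] += nelt / 2;
  printf ("\nlow: ");
  for (unsigned i = 0; i < nelt; i++)
    printf (" %u", sel[i]);
  printf ("\n");
}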
4801 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4802 type VECTYPE. */
4804 bool
4805 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4807 return vect_lanes_optab_supported_p ("vec_store_lanes",
4808 vec_store_lanes_optab,
4809 vectype, count);
4813 /* Function vect_permute_store_chain.
4815 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4816 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4817 the data correctly for the stores. Return the final references for stores
4818 in RESULT_CHAIN.
4820 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4821 The input is 4 vectors each containing 8 elements. We assign a number to
4822 each element, the input sequence is:
4824 1st vec: 0 1 2 3 4 5 6 7
4825 2nd vec: 8 9 10 11 12 13 14 15
4826 3rd vec: 16 17 18 19 20 21 22 23
4827 4th vec: 24 25 26 27 28 29 30 31
4829 The output sequence should be:
4831 1st vec: 0 8 16 24 1 9 17 25
4832 2nd vec: 2 10 18 26 3 11 19 27
4833 3rd vec: 4 12 20 28 5 13 21 29
4834 4th vec: 6 14 22 30 7 15 23 31
4836 i.e., we interleave the contents of the four vectors in their order.
4838 We use interleave_high/low instructions to create such output. The input of
4839 each interleave_high/low operation is two vectors:
4840 1st vec 2nd vec
4841 0 1 2 3 4 5 6 7
4842 the even elements of the result vector are obtained left-to-right from the
4843 high/low elements of the first vector. The odd elements of the result are
4844 obtained left-to-right from the high/low elements of the second vector.
4845 The output of interleave_high will be: 0 4 1 5
4846 and of interleave_low: 2 6 3 7
4849 The permutation is done in log LENGTH stages. In each stage interleave_high
4850 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4851 where the first argument is taken from the first half of DR_CHAIN and the
4852 second argument from its second half.
4853 In our example,
4855 I1: interleave_high (1st vec, 3rd vec)
4856 I2: interleave_low (1st vec, 3rd vec)
4857 I3: interleave_high (2nd vec, 4th vec)
4858 I4: interleave_low (2nd vec, 4th vec)
4860 The output for the first stage is:
4862 I1: 0 16 1 17 2 18 3 19
4863 I2: 4 20 5 21 6 22 7 23
4864 I3: 8 24 9 25 10 26 11 27
4865 I4: 12 28 13 29 14 30 15 31
4867 The output of the second stage, i.e. the final result is:
4869 I1: 0 8 16 24 1 9 17 25
4870 I2: 2 10 18 26 3 11 19 27
4871 I3: 4 12 20 28 5 13 21 29
4872 I4: 6 14 22 30 7 15 23 31. */
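/* Illustrative sketch, not part of this file: a stand-alone model of the
   power-of-two case walked through above.  Each stage pairs vector j
   with vector j + LENGTH/2 and writes the interleave-high/low results
   back in order; two stages on the vectors 0..31 print exactly the
   final chain shown in the comment.  Hypothetical names.  */

#include <stdio.h>
#include <string.h>

#define NELT 8			/* elements per vector  */
#define LEN  4			/* vectors in the chain */

/* interleave_high/low of one pair, as described above.  */
static void
interleave (const int *a, const int *b, int *high, int *low)
{
  for (int k = 0; k < NELT / 2; k++)
    {
      high[2 * k]     = a[k];
      high[2 * k + 1] = b[k];
      low[2 * k]      = a[NELT / 2 + k];
      low[2 * k + 1]  = b[NELT / 2 + k];
    }
}

int
main (void)
{
  int chain[LEN][NELT], result[LEN][NELT];

  for (int v = 0; v < LEN; v++)
    for (int e = 0; e < NELT; e++)
      chain[v][e] = v * NELT + e;

  /* log2 (LEN) == 2 stages; pair vector j with vector j + LEN/2.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LEN / 2; j++)
	interleave (chain[j], chain[j + LEN / 2],
		    result[2 * j], result[2 * j + 1]);
      memcpy (chain, result, sizeof chain);
    }

  for (int v = 0; v < LEN; v++)
    {
      for (int e = 0; e < NELT; e++)
	printf (" %2d", chain[v][e]);
      printf ("\n");
    }
  return 0;
}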
4874 void
4875 vect_permute_store_chain (vec<tree> dr_chain,
4876 unsigned int length,
4877 gimple *stmt,
4878 gimple_stmt_iterator *gsi,
4879 vec<tree> *result_chain)
4881 tree vect1, vect2, high, low;
4882 gimple *perm_stmt;
4883 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4884 tree perm_mask_low, perm_mask_high;
4885 tree data_ref;
4886 tree perm3_mask_low, perm3_mask_high;
4887 unsigned int i, n, log_length = exact_log2 (length);
4888 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4889 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4891 result_chain->quick_grow (length);
4892 memcpy (result_chain->address (), dr_chain.address (),
4893 length * sizeof (tree));
4895 if (length == 3)
4897 unsigned int j0 = 0, j1 = 0, j2 = 0;
4899 for (j = 0; j < 3; j++)
4901 int nelt0 = ((3 - j) * nelt) % 3;
4902 int nelt1 = ((3 - j) * nelt + 1) % 3;
4903 int nelt2 = ((3 - j) * nelt + 2) % 3;
4905 for (i = 0; i < nelt; i++)
4907 if (3 * i + nelt0 < nelt)
4908 sel[3 * i + nelt0] = j0++;
4909 if (3 * i + nelt1 < nelt)
4910 sel[3 * i + nelt1] = nelt + j1++;
4911 if (3 * i + nelt2 < nelt)
4912 sel[3 * i + nelt2] = 0;
4914 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4916 for (i = 0; i < nelt; i++)
4918 if (3 * i + nelt0 < nelt)
4919 sel[3 * i + nelt0] = 3 * i + nelt0;
4920 if (3 * i + nelt1 < nelt)
4921 sel[3 * i + nelt1] = 3 * i + nelt1;
4922 if (3 * i + nelt2 < nelt)
4923 sel[3 * i + nelt2] = nelt + j2++;
4925 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4927 vect1 = dr_chain[0];
4928 vect2 = dr_chain[1];
4930 /* Create interleaving stmt:
4931 low = VEC_PERM_EXPR <vect1, vect2,
4932 {j, nelt, *, j + 1, nelt + j + 1, *,
4933 j + 2, nelt + j + 2, *, ...}> */
4934 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4935 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4936 vect2, perm3_mask_low);
4937 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4939 vect1 = data_ref;
4940 vect2 = dr_chain[2];
4941 /* Create interleaving stmt:
4942 low = VEC_PERM_EXPR <vect1, vect2,
4943 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4944 6, 7, nelt + j + 2, ...}> */
4945 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4946 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4947 vect2, perm3_mask_high);
4948 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4949 (*result_chain)[j] = data_ref;
4952 else
4954 /* If length is not equal to 3 then only power of 2 is supported. */
4955 gcc_assert (pow2p_hwi (length));
4957 for (i = 0, n = nelt / 2; i < n; i++)
4959 sel[i * 2] = i;
4960 sel[i * 2 + 1] = i + nelt;
4962 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4964 for (i = 0; i < nelt; i++)
4965 sel[i] += nelt / 2;
4966 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4968 for (i = 0, n = log_length; i < n; i++)
4970 for (j = 0; j < length/2; j++)
4972 vect1 = dr_chain[j];
4973 vect2 = dr_chain[j+length/2];
4975 /* Create interleaving stmt:
4976 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4977 ...}> */
4978 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4979 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4980 vect2, perm_mask_high);
4981 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4982 (*result_chain)[2*j] = high;
4984 /* Create interleaving stmt:
4985 low = VEC_PERM_EXPR <vect1, vect2,
4986 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4987 ...}> */
4988 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4989 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4990 vect2, perm_mask_low);
4991 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4992 (*result_chain)[2*j+1] = low;
4994 memcpy (dr_chain.address (), result_chain->address (),
4995 length * sizeof (tree));
5000 /* Function vect_setup_realignment
5002 This function is called when vectorizing an unaligned load using
5003 the dr_explicit_realign[_optimized] scheme.
5004 This function generates the following code at the loop prolog:
5006 p = initial_addr;
5007 x msq_init = *(floor(p)); # prolog load
5008 realignment_token = call target_builtin;
5009 loop:
5010 x msq = phi (msq_init, ---)
5012 The stmts marked with x are generated only for the case of
5013 dr_explicit_realign_optimized.
5015 The code above sets up a new (vector) pointer, pointing to the first
5016 location accessed by STMT, and a "floor-aligned" load using that pointer.
5017 It also generates code to compute the "realignment-token" (if the relevant
5018 target hook was defined), and creates a phi-node at the loop-header bb
5019 whose arguments are the result of the prolog-load (created by this
5020 function) and the result of a load that takes place in the loop (to be
5021 created by the caller to this function).
5023 For the case of dr_explicit_realign_optimized:
5024 The caller to this function uses the phi-result (msq) to create the
5025 realignment code inside the loop, and sets up the missing phi argument,
5026 as follows:
5027 loop:
5028 msq = phi (msq_init, lsq)
5029 lsq = *(floor(p')); # load in loop
5030 result = realign_load (msq, lsq, realignment_token);
5032 For the case of dr_explicit_realign:
5033 loop:
5034 msq = *(floor(p)); # load in loop
5035 p' = p + (VS-1);
5036 lsq = *(floor(p')); # load in loop
5037 result = realign_load (msq, lsq, realignment_token);
5039 Input:
5040 STMT - (scalar) load stmt to be vectorized. This load accesses
5041 a memory location that may be unaligned.
5042 BSI - place where new code is to be inserted.
5043 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5044 is used.
5046 Output:
5047 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5048 target hook, if defined.
5049 Return value - the result of the loop-header phi node. */
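/* Illustrative sketch, not part of this file: the dr_explicit_realign
   idea at the scalar level, assuming 16-byte "vectors" of four ints.
   Two aligned loads that straddle the unaligned address are combined by
   shifting out the first OFS elements; the real REALIGN_LOAD does this
   with a target-specific permute driven by the realignment token, so
   this is only a conceptual model with hypothetical names.  Assumes P
   points into an array with at least a full vector of slack on each
   side of the aligned blocks touched.  */

#include <stdint.h>
#include <string.h>

#define VS 4			/* ints per "vector" (16 bytes) */

static void
realign_load_example (const int *p, int out[VS])
{
  uintptr_t mask = ~(uintptr_t) (VS * sizeof (int) - 1);
  const int *msq_p = (const int *) ((uintptr_t) p & mask);	      /* floor (p)          */
  const int *lsq_p = (const int *) ((uintptr_t) (p + VS - 1) & mask); /* floor (p + VS - 1) */
  unsigned ofs = (unsigned) (p - msq_p);	/* misalignment in elements, 0..VS-1 */
  int msq[VS], lsq[VS];
  memcpy (msq, msq_p, sizeof msq);	/* msq = *(floor (p))           -- aligned load */
  memcpy (lsq, lsq_p, sizeof lsq);	/* lsq = *(floor (p + VS - 1))  -- aligned load */
  for (unsigned i = 0; i < VS; i++)	/* result = realign_load (msq, lsq, ofs)        */
    out[i] = i + ofs < VS ? msq[i + ofs] : lsq[i + ofs - VS];
}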
5051 tree
5052 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
5053 tree *realignment_token,
5054 enum dr_alignment_support alignment_support_scheme,
5055 tree init_addr,
5056 struct loop **at_loop)
5058 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5059 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5060 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5061 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
5062 struct loop *loop = NULL;
5063 edge pe = NULL;
5064 tree scalar_dest = gimple_assign_lhs (stmt);
5065 tree vec_dest;
5066 gimple *inc;
5067 tree ptr;
5068 tree data_ref;
5069 basic_block new_bb;
5070 tree msq_init = NULL_TREE;
5071 tree new_temp;
5072 gphi *phi_stmt;
5073 tree msq = NULL_TREE;
5074 gimple_seq stmts = NULL;
5075 bool inv_p;
5076 bool compute_in_loop = false;
5077 bool nested_in_vect_loop = false;
5078 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
5079 struct loop *loop_for_initial_load = NULL;
5081 if (loop_vinfo)
5083 loop = LOOP_VINFO_LOOP (loop_vinfo);
5084 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
5087 gcc_assert (alignment_support_scheme == dr_explicit_realign
5088 || alignment_support_scheme == dr_explicit_realign_optimized);
5090 /* We need to generate three things:
5091 1. the misalignment computation
5092 2. the extra vector load (for the optimized realignment scheme).
5093 3. the phi node for the two vectors from which the realignment is
5094 done (for the optimized realignment scheme). */
5096 /* 1. Determine where to generate the misalignment computation.
5098 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5099 calculation will be generated by this function, outside the loop (in the
5100 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5101 caller, inside the loop.
5103 Background: If the misalignment remains fixed throughout the iterations of
5104 the loop, then both realignment schemes are applicable, and also the
5105 misalignment computation can be done outside LOOP. This is because we are
5106 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5107 are a multiple of VS (the Vector Size), and therefore the misalignment in
5108 different vectorized LOOP iterations is always the same.
5109 The problem arises only if the memory access is in an inner-loop nested
5110 inside LOOP, which is now being vectorized using outer-loop vectorization.
5111 This is the only case when the misalignment of the memory access may not
5112 remain fixed throughout the iterations of the inner-loop (as explained in
5113 detail in vect_supportable_dr_alignment). In this case, not only is the
5114 optimized realignment scheme not applicable, but also the misalignment
5115 computation (and generation of the realignment token that is passed to
5116 REALIGN_LOAD) has to be done inside the loop.
5118 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5119 or not, which in turn determines if the misalignment is computed inside
5120 the inner-loop, or outside LOOP. */
5122 if (init_addr != NULL_TREE || !loop_vinfo)
5124 compute_in_loop = true;
5125 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5129 /* 2. Determine where to generate the extra vector load.
5131 For the optimized realignment scheme, instead of generating two vector
5132 loads in each iteration, we generate a single extra vector load in the
5133 preheader of the loop, and in each iteration reuse the result of the
5134 vector load from the previous iteration. In case the memory access is in
5135 an inner-loop nested inside LOOP, which is now being vectorized using
5136 outer-loop vectorization, we need to determine whether this initial vector
5137 load should be generated at the preheader of the inner-loop, or can be
5138 generated at the preheader of LOOP. If the memory access has no evolution
5139 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5140 to be generated inside LOOP (in the preheader of the inner-loop). */
5142 if (nested_in_vect_loop)
5144 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5145 bool invariant_in_outerloop =
5146 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5147 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5149 else
5150 loop_for_initial_load = loop;
5151 if (at_loop)
5152 *at_loop = loop_for_initial_load;
5154 if (loop_for_initial_load)
5155 pe = loop_preheader_edge (loop_for_initial_load);
5157 /* 3. For the case of the optimized realignment, create the first vector
5158 load at the loop preheader. */
5160 if (alignment_support_scheme == dr_explicit_realign_optimized)
5162 /* Create msq_init = *(floor(p1)) in the loop preheader */
5163 gassign *new_stmt;
5165 gcc_assert (!compute_in_loop);
5166 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5167 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
5168 NULL_TREE, &init_addr, NULL, &inc,
5169 true, &inv_p);
5170 if (TREE_CODE (ptr) == SSA_NAME)
5171 new_temp = copy_ssa_name (ptr);
5172 else
5173 new_temp = make_ssa_name (TREE_TYPE (ptr));
5174 new_stmt = gimple_build_assign
5175 (new_temp, BIT_AND_EXPR, ptr,
5176 build_int_cst (TREE_TYPE (ptr),
5177 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
5178 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5179 gcc_assert (!new_bb);
5180 data_ref
5181 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5182 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5183 new_stmt = gimple_build_assign (vec_dest, data_ref);
5184 new_temp = make_ssa_name (vec_dest, new_stmt);
5185 gimple_assign_set_lhs (new_stmt, new_temp);
5186 if (pe)
5188 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5189 gcc_assert (!new_bb);
5191 else
5192 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5194 msq_init = gimple_assign_lhs (new_stmt);
5197 /* 4. Create realignment token using a target builtin, if available.
5198 It is done either inside the containing loop, or before LOOP (as
5199 determined above). */
5201 if (targetm.vectorize.builtin_mask_for_load)
5203 gcall *new_stmt;
5204 tree builtin_decl;
5206 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5207 if (!init_addr)
5209 /* Generate the INIT_ADDR computation outside LOOP. */
5210 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5211 NULL_TREE, loop);
5212 if (loop)
5214 pe = loop_preheader_edge (loop);
5215 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5216 gcc_assert (!new_bb);
5218 else
5219 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5222 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5223 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5224 vec_dest =
5225 vect_create_destination_var (scalar_dest,
5226 gimple_call_return_type (new_stmt));
5227 new_temp = make_ssa_name (vec_dest, new_stmt);
5228 gimple_call_set_lhs (new_stmt, new_temp);
5230 if (compute_in_loop)
5231 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5232 else
5234 /* Generate the misalignment computation outside LOOP. */
5235 pe = loop_preheader_edge (loop);
5236 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5237 gcc_assert (!new_bb);
5240 *realignment_token = gimple_call_lhs (new_stmt);
5242 /* The result of the CALL_EXPR to this builtin is determined from
5243 the value of the parameter and no global variables are touched
5244 which makes the builtin a "const" function. Requiring the
5245 builtin to have the "const" attribute makes it unnecessary
5246 to call mark_call_clobbered. */
5247 gcc_assert (TREE_READONLY (builtin_decl));
5250 if (alignment_support_scheme == dr_explicit_realign)
5251 return msq;
5253 gcc_assert (!compute_in_loop);
5254 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5257 /* 5. Create msq = phi <msq_init, lsq> in loop */
5259 pe = loop_preheader_edge (containing_loop);
5260 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5261 msq = make_ssa_name (vec_dest);
5262 phi_stmt = create_phi_node (msq, containing_loop->header);
5263 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5265 return msq;
5269 /* Function vect_grouped_load_supported.
5271 COUNT is the size of the load group (the number of statements plus the
5272 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5273 only one statement, with a gap of COUNT - 1.
5275 Returns true if a suitable permute exists. */
5277 bool
5278 vect_grouped_load_supported (tree vectype, bool single_element_p,
5279 unsigned HOST_WIDE_INT count)
5281 machine_mode mode = TYPE_MODE (vectype);
5283 /* If this is single-element interleaving with an element distance
5284 that leaves unused vector loads around, punt - we at least create
5285 very sub-optimal code in that case (and blow up memory,
5286 see PR65518). */
5287 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5289 if (dump_enabled_p ())
5290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5291 "single-element interleaving not supported "
5292 "for not adjacent vector loads\n");
5293 return false;
5296 /* vect_permute_load_chain requires the group size to be equal to 3 or
5297 be a power of two. */
5298 if (count != 3 && exact_log2 (count) == -1)
5300 if (dump_enabled_p ())
5301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5302 "the size of the group of accesses"
5303 " is not a power of 2 or not equal to 3\n");
5304 return false;
5307 /* Check that the permutation is supported. */
5308 if (VECTOR_MODE_P (mode))
5310 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5311 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5313 if (count == 3)
5315 unsigned int k;
5316 for (k = 0; k < 3; k++)
5318 for (i = 0; i < nelt; i++)
5319 if (3 * i + k < 2 * nelt)
5320 sel[i] = 3 * i + k;
5321 else
5322 sel[i] = 0;
5323 if (!can_vec_perm_p (mode, false, sel))
5325 if (dump_enabled_p ())
5326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5327 "shuffle of 3 loads is not supported by"
5328 " target\n");
5329 return false;
5331 for (i = 0, j = 0; i < nelt; i++)
5332 if (3 * i + k < 2 * nelt)
5333 sel[i] = i;
5334 else
5335 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5336 if (!can_vec_perm_p (mode, false, sel))
5338 if (dump_enabled_p ())
5339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5340 "shuffle of 3 loads is not supported by"
5341 " target\n");
5342 return false;
5345 return true;
5347 else
5349 /* If length is not equal to 3 then only power of 2 is supported. */
5350 gcc_assert (pow2p_hwi (count));
5351 for (i = 0; i < nelt; i++)
5352 sel[i] = i * 2;
5353 if (can_vec_perm_p (mode, false, sel))
5355 for (i = 0; i < nelt; i++)
5356 sel[i] = i * 2 + 1;
5357 if (can_vec_perm_p (mode, false, sel))
5358 return true;
5363 if (dump_enabled_p ())
5364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5365 "extract even/odd not supported by target\n");
5366 return false;
5369 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5370 type VECTYPE. */
5372 bool
5373 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5375 return vect_lanes_optab_supported_p ("vec_load_lanes",
5376 vec_load_lanes_optab,
5377 vectype, count);
5380 /* Function vect_permute_load_chain.
5382 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5383 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5384 the input data correctly. Return the final references for loads in
5385 RESULT_CHAIN.
5387 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5388 The input is 4 vectors each containing 8 elements. We assign a number to each
5389 element, the input sequence is:
5391 1st vec: 0 1 2 3 4 5 6 7
5392 2nd vec: 8 9 10 11 12 13 14 15
5393 3rd vec: 16 17 18 19 20 21 22 23
5394 4th vec: 24 25 26 27 28 29 30 31
5396 The output sequence should be:
5398 1st vec: 0 4 8 12 16 20 24 28
5399 2nd vec: 1 5 9 13 17 21 25 29
5400 3rd vec: 2 6 10 14 18 22 26 30
5401 4th vec: 3 7 11 15 19 23 27 31
5403 i.e., the first output vector should contain the first elements of each
5404 interleaving group, etc.
5406 We use extract_even/odd instructions to create such output. The input of
5407 each extract_even/odd operation is two vectors
5408 1st vec 2nd vec
5409 0 1 2 3 4 5 6 7
5411 and the output is the vector of extracted even/odd elements. The output of
5412 extract_even will be: 0 2 4 6
5413 and of extract_odd: 1 3 5 7
5416 The permutation is done in log LENGTH stages. In each stage extract_even
5417 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5418 their order. In our example,
5420 E1: extract_even (1st vec, 2nd vec)
5421 E2: extract_odd (1st vec, 2nd vec)
5422 E3: extract_even (3rd vec, 4th vec)
5423 E4: extract_odd (3rd vec, 4th vec)
5425 The output for the first stage will be:
5427 E1: 0 2 4 6 8 10 12 14
5428 E2: 1 3 5 7 9 11 13 15
5429 E3: 16 18 20 22 24 26 28 30
5430 E4: 17 19 21 23 25 27 29 31
5432 In order to proceed and create the correct sequence for the next stage (or
5433 for the correct output, if the second stage is the last one, as in our
5434 example), we first put the output of the extract_even operation and then the
5435 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5436 The input for the second stage is:
5438 1st vec (E1): 0 2 4 6 8 10 12 14
5439 2nd vec (E3): 16 18 20 22 24 26 28 30
5440 3rd vec (E2): 1 3 5 7 9 11 13 15
5441 4th vec (E4): 17 19 21 23 25 27 29 31
5443 The output of the second stage:
5445 E1: 0 4 8 12 16 20 24 28
5446 E2: 2 6 10 14 18 22 26 30
5447 E3: 1 5 9 13 17 21 25 29
5448 E4: 3 7 11 15 19 23 27 31
5450 And RESULT_CHAIN after reordering:
5452 1st vec (E1): 0 4 8 12 16 20 24 28
5453 2nd vec (E3): 1 5 9 13 17 21 25 29
5454 3rd vec (E2): 2 6 10 14 18 22 26 30
5455 4th vec (E4): 3 7 11 15 19 23 27 31. */
5457 static void
5458 vect_permute_load_chain (vec<tree> dr_chain,
5459 unsigned int length,
5460 gimple *stmt,
5461 gimple_stmt_iterator *gsi,
5462 vec<tree> *result_chain)
5464 tree data_ref, first_vect, second_vect;
5465 tree perm_mask_even, perm_mask_odd;
5466 tree perm3_mask_low, perm3_mask_high;
5467 gimple *perm_stmt;
5468 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5469 unsigned int i, j, log_length = exact_log2 (length);
5470 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5471 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5473 result_chain->quick_grow (length);
5474 memcpy (result_chain->address (), dr_chain.address (),
5475 length * sizeof (tree));
5477 if (length == 3)
5479 unsigned int k;
5481 for (k = 0; k < 3; k++)
5483 for (i = 0; i < nelt; i++)
5484 if (3 * i + k < 2 * nelt)
5485 sel[i] = 3 * i + k;
5486 else
5487 sel[i] = 0;
5488 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5490 for (i = 0, j = 0; i < nelt; i++)
5491 if (3 * i + k < 2 * nelt)
5492 sel[i] = i;
5493 else
5494 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5496 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5498 first_vect = dr_chain[0];
5499 second_vect = dr_chain[1];
5501 /* Create interleaving stmt (low part of):
5502 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5503 ...}> */
5504 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5505 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5506 second_vect, perm3_mask_low);
5507 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5509 /* Create interleaving stmt (high part of):
5510 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5511 ...}> */
5512 first_vect = data_ref;
5513 second_vect = dr_chain[2];
5514 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5515 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5516 second_vect, perm3_mask_high);
5517 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5518 (*result_chain)[k] = data_ref;
5521 else
5523 /* If length is not equal to 3, then only a power of 2 is supported. */
5524 gcc_assert (pow2p_hwi (length));
5526 for (i = 0; i < nelt; ++i)
5527 sel[i] = i * 2;
5528 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5530 for (i = 0; i < nelt; ++i)
5531 sel[i] = i * 2 + 1;
5532 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5534 for (i = 0; i < log_length; i++)
5536 for (j = 0; j < length; j += 2)
5538 first_vect = dr_chain[j];
5539 second_vect = dr_chain[j+1];
5541 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5542 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5543 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5544 first_vect, second_vect,
5545 perm_mask_even);
5546 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5547 (*result_chain)[j/2] = data_ref;
5549 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5550 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5551 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5552 first_vect, second_vect,
5553 perm_mask_odd);
5554 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5555 (*result_chain)[j/2+length/2] = data_ref;
5557 memcpy (dr_chain.address (), result_chain->address (),
5558 length * sizeof (tree));
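/* Illustrative sketch (not GCC code): the standalone program below is an
   assumption-laden model of the pass, not the pass itself.  It replays the
   even/odd deinterleave above on plain int arrays for LENGTH == 4 and
   8 lanes per vector; perm_even_odd stands in for the two VEC_PERM_EXPRs
   built with perm_mask_even and perm_mask_odd, and results land at
   j/2 and j/2 + LENGTH/2 exactly as in the loop above.  Running it prints
   the four rows 0 4 8 ... 28 through 3 7 11 ... 31 from the function
   comment.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8   /* lanes per vector */
#define LEN  4   /* length of the load chain */

/* Lane i of EVEN is element 2*i of the concatenation <a, b>;
   lane i of ODD is element 2*i + 1.  */
static void
perm_even_odd (const int *a, const int *b, int *even, int *odd)
{
  for (int i = 0; i < NELT; i++)
    {
      int e = 2 * i, o = 2 * i + 1;
      even[i] = e < NELT ? a[e] : b[e - NELT];
      odd[i] = o < NELT ? a[o] : b[o - NELT];
    }
}

int
main (void)
{
  int chain[LEN][NELT], result[LEN][NELT];

  /* Number the elements 0..31 as in the example above.  */
  for (int v = 0; v < LEN; v++)
    for (int i = 0; i < NELT; i++)
      chain[v][i] = v * NELT + i;

  /* log2 (LEN) == 2 stages, mirroring the loop over log_length.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LEN; j += 2)
        perm_even_odd (chain[j], chain[j + 1],
                       result[j / 2], result[j / 2 + LEN / 2]);
      memcpy (chain, result, sizeof (chain));
    }

  for (int v = 0; v < LEN; v++)
    {
      for (int i = 0; i < NELT; i++)
        printf ("%d ", chain[v][i]);
      printf ("\n");
    }
  return 0;
}
#endif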
5563 /* Function vect_shift_permute_load_chain.
5565 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5566 a sequence of stmts to reorder the input data accordingly.
5567 Return the final references for loads in RESULT_CHAIN.
5568 Return true if successful, false otherwise.
5570 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5571 The input is 3 vectors each containing 8 elements. We assign a
5572 number to each element; the input sequence is:
5574 1st vec: 0 1 2 3 4 5 6 7
5575 2nd vec: 8 9 10 11 12 13 14 15
5576 3rd vec: 16 17 18 19 20 21 22 23
5578 The output sequence should be:
5580 1st vec: 0 3 6 9 12 15 18 21
5581 2nd vec: 1 4 7 10 13 16 19 22
5582 3rd vec: 2 5 8 11 14 17 20 23
5584 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5586 First we shuffle all 3 vectors to get the correct element order:
5588 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5589 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5590 3rd vec: (16 19 22) (17 20 23) (18 21)
5592 Next we unite and shift the vectors 3 times:
5594 1st step:
5595 shift right by 6 the concatenation of:
5596 "1st vec" and "2nd vec"
5597 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5598 "2nd vec" and "3rd vec"
5599 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5600 "3rd vec" and "1st vec"
5601 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5602 | New vectors |
5604 So that now new vectors are:
5606 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5607 2nd vec: (10 13) (16 19 22) (17 20 23)
5608 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5610 2nd step:
5611 shift right by 5 the concatenation of:
5612 "1st vec" and "3rd vec"
5613 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5614 "2nd vec" and "1st vec"
5615 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5616 "3rd vec" and "2nd vec"
5617 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5618 | New vectors |
5620 So that now new vectors are:
5622 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5623 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5624 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5626 3rd step:
5627 shift right by 5 the concatenation of:
5628 "1st vec" and "1st vec"
5629 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5630 shift right by 3 the concatenation of:
5631 "2nd vec" and "2nd vec"
5632 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5633 | New vectors |
5635 So that now all vectors are READY:
5636 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5637 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5638 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5640 This algorithm is faster than the one in vect_permute_load_chain if:
5641 1. "shift of a concatenation" is faster than general permutation.
5642 This is usually so.
5643 2. The TARGET machine can't execute vector instructions in parallel.
5644 This is because each step of the algorithm depends on the previous one.
5645 The algorithm in vect_permute_load_chain is much more parallel.
5647 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5650 static bool
5651 vect_shift_permute_load_chain (vec<tree> dr_chain,
5652 unsigned int length,
5653 gimple *stmt,
5654 gimple_stmt_iterator *gsi,
5655 vec<tree> *result_chain)
5657 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5658 tree perm2_mask1, perm2_mask2, perm3_mask;
5659 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5660 gimple *perm_stmt;
5662 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5663 unsigned int i;
5664 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5665 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5666 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5667 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5669 result_chain->quick_grow (length);
5670 memcpy (result_chain->address (), dr_chain.address (),
5671 length * sizeof (tree));
5673 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5675 unsigned int j, log_length = exact_log2 (length);
5676 for (i = 0; i < nelt / 2; ++i)
5677 sel[i] = i * 2;
5678 for (i = 0; i < nelt / 2; ++i)
5679 sel[nelt / 2 + i] = i * 2 + 1;
5680 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5682 if (dump_enabled_p ())
5683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5684 "shuffle of 2 fields structure is not \
5685 supported by target\n");
5686 return false;
5688 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5690 for (i = 0; i < nelt / 2; ++i)
5691 sel[i] = i * 2 + 1;
5692 for (i = 0; i < nelt / 2; ++i)
5693 sel[nelt / 2 + i] = i * 2;
5694 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5696 if (dump_enabled_p ())
5697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5698 "shuffle of 2 fields structure is not \
5699 supported by target\n");
5700 return false;
5702 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5704 /* Generating permutation constant to shift all elements.
5705 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5706 for (i = 0; i < nelt; i++)
5707 sel[i] = nelt / 2 + i;
5708 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5710 if (dump_enabled_p ())
5711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5712 "shift permutation is not supported by target\n");
5713 return false;
5715 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5717 /* Generating permutation constant to select vector from 2.
5718 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5719 for (i = 0; i < nelt / 2; i++)
5720 sel[i] = i;
5721 for (i = nelt / 2; i < nelt; i++)
5722 sel[i] = nelt + i;
5723 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5725 if (dump_enabled_p ())
5726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5727 "select is not supported by target\n");
5728 return false;
5730 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5732 for (i = 0; i < log_length; i++)
5734 for (j = 0; j < length; j += 2)
5736 first_vect = dr_chain[j];
5737 second_vect = dr_chain[j + 1];
5739 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5740 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5741 first_vect, first_vect,
5742 perm2_mask1);
5743 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5744 vect[0] = data_ref;
5746 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5747 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5748 second_vect, second_vect,
5749 perm2_mask2);
5750 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5751 vect[1] = data_ref;
5753 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5754 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5755 vect[0], vect[1], shift1_mask);
5756 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5757 (*result_chain)[j/2 + length/2] = data_ref;
5759 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5760 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5761 vect[0], vect[1], select_mask);
5762 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5763 (*result_chain)[j/2] = data_ref;
5765 memcpy (dr_chain.address (), result_chain->address (),
5766 length * sizeof (tree));
5768 return true;
5770 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5772 unsigned int k = 0, l = 0;
5774 /* Generating permutation constant to get all elements in the right order.
5775 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5776 for (i = 0; i < nelt; i++)
5778 if (3 * k + (l % 3) >= nelt)
5780 k = 0;
5781 l += (3 - (nelt % 3));
5783 sel[i] = 3 * k + (l % 3);
5784 k++;
5786 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5788 if (dump_enabled_p ())
5789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5790 "shuffle of 3 fields structure is not \
5791 supported by target\n");
5792 return false;
5794 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5796 /* Generating permutation constant to shift all elements.
5797 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5798 for (i = 0; i < nelt; i++)
5799 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5800 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5802 if (dump_enabled_p ())
5803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5804 "shift permutation is not supported by target\n");
5805 return false;
5807 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5809 /* Generating permutation constant to shift all elements.
5810 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5811 for (i = 0; i < nelt; i++)
5812 sel[i] = 2 * (nelt / 3) + 1 + i;
5813 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5815 if (dump_enabled_p ())
5816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5817 "shift permutation is not supported by target\n");
5818 return false;
5820 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5822 /* Generating permutation constant to shift all elements.
5823 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5824 for (i = 0; i < nelt; i++)
5825 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5826 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5828 if (dump_enabled_p ())
5829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5830 "shift permutation is not supported by target\n");
5831 return false;
5833 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5835 /* Generating permutation constant to shift all elements.
5836 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5837 for (i = 0; i < nelt; i++)
5838 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5839 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5841 if (dump_enabled_p ())
5842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5843 "shift permutation is not supported by target\n");
5844 return false;
5846 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5848 for (k = 0; k < 3; k++)
5850 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5851 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5852 dr_chain[k], dr_chain[k],
5853 perm3_mask);
5854 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5855 vect[k] = data_ref;
5858 for (k = 0; k < 3; k++)
5860 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5861 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5862 vect[k % 3], vect[(k + 1) % 3],
5863 shift1_mask);
5864 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5865 vect_shift[k] = data_ref;
5868 for (k = 0; k < 3; k++)
5870 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5871 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5872 vect_shift[(4 - k) % 3],
5873 vect_shift[(3 - k) % 3],
5874 shift2_mask);
5875 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5876 vect[k] = data_ref;
5879 (*result_chain)[3 - (nelt % 3)] = vect[2];
5881 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5882 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5883 vect[0], shift3_mask);
5884 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5885 (*result_chain)[nelt % 3] = data_ref;
5887 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5888 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5889 vect[1], shift4_mask);
5890 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5891 (*result_chain)[0] = data_ref;
5892 return true;
5894 return false;
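/* Illustrative sketch (not GCC code): a minimal standalone program, written
   under the same nelt == 8 assumption as the comments above, that recomputes
   the "right order" selector used by the 3-load path and prints it; the
   expected output is 0 3 6 1 4 7 2 5, matching the comment before
   perm3_mask.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned int nelt = 8, k = 0, l = 0;
  unsigned char sel[8];

  /* Same index arithmetic as the perm3_mask loop above.  */
  for (unsigned int i = 0; i < nelt; i++)
    {
      if (3 * k + (l % 3) >= nelt)
        {
          k = 0;
          l += 3 - (nelt % 3);
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }

  for (unsigned int i = 0; i < nelt; i++)
    printf ("%u ", sel[i]);
  printf ("\n");
  return 0;
}
#endif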
5897 /* Function vect_transform_grouped_load.
5899 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5900 to perform their permutation and ascribe the resulting vectorized statements to
5901 the scalar statements.
5904 void
5905 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5906 gimple_stmt_iterator *gsi)
5908 machine_mode mode;
5909 vec<tree> result_chain = vNULL;
5911 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5912 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5913 vectors, that are ready for vector computation. */
5914 result_chain.create (size);
5916 /* If the reassociation width for the vector type is 2 or greater, the target
5917 machine can execute 2 or more vector instructions in parallel. Otherwise try
5918 to get the chain for the load group using vect_shift_permute_load_chain. */
5919 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5920 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5921 || pow2p_hwi (size)
5922 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5923 gsi, &result_chain))
5924 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5925 vect_record_grouped_load_vectors (stmt, result_chain);
5926 result_chain.release ();
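/* For illustration (derived from the short-circuit above, not a separate
   rule): with a group SIZE of 3 on a target whose reassociation width for
   VEC_PERM_EXPR is 1, vect_shift_permute_load_chain is tried first and
   vect_permute_load_chain only runs if it fails (e.g. because a required
   shift permutation is unsupported).  A power-of-two SIZE, or a
   reassociation width of 2 or more, takes the vect_permute_load_chain path
   directly without trying the shift variant.  */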
5929 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5930 generated as part of the vectorization of STMT. Assign the statement
5931 for each vector to the associated scalar statement. */
5933 void
5934 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5936 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5937 gimple *next_stmt, *new_stmt;
5938 unsigned int i, gap_count;
5939 tree tmp_data_ref;
5941 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5942 Since we scan the chain starting from its first node, their order
5943 corresponds to the order of data-refs in RESULT_CHAIN. */
5944 next_stmt = first_stmt;
5945 gap_count = 1;
5946 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5948 if (!next_stmt)
5949 break;
5951 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5952 code elimination pass later. No need to check for the first stmt in
5953 the group, since it always exists.
5954 GROUP_GAP is the number of steps in elements from the previous
5955 access (if there is no gap GROUP_GAP is 1). We skip loads that
5956 correspond to the gaps. */
5957 if (next_stmt != first_stmt
5958 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5960 gap_count++;
5961 continue;
5964 while (next_stmt)
5966 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5967 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5968 copies, and we put the new vector statement in the first available
5969 RELATED_STMT. */
5970 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5971 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5972 else
5974 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5976 gimple *prev_stmt =
5977 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5978 gimple *rel_stmt =
5979 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5980 while (rel_stmt)
5982 prev_stmt = rel_stmt;
5983 rel_stmt =
5984 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5987 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5988 new_stmt;
5992 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5993 gap_count = 1;
5994 /* If NEXT_STMT accesses the same DR as the previous statement,
5995 put the same TMP_DATA_REF as its vectorized statement; otherwise
5996 get the next data-ref from RESULT_CHAIN. */
5997 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5998 break;
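/* For instance (an illustrative assumption about one possible group): if
   the scalar group accesses a[3*i] and a[3*i + 2], the second member has
   GROUP_GAP 2, so one entry of RESULT_CHAIN - the permuted vector that
   would correspond to the unused a[3*i + 1] elements - is skipped before a
   vectorized statement is recorded for it; the skipped load is later
   removed by dead code elimination, as noted above.  */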
6003 /* Function vect_can_force_dr_alignment_p.
6005 Returns whether the alignment of DECL can be forced to an ALIGNMENT-bit
6006 boundary. */
6008 bool
6009 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6011 if (!VAR_P (decl))
6012 return false;
6014 if (decl_in_symtab_p (decl)
6015 && !symtab_node::get (decl)->can_increase_alignment_p ())
6016 return false;
6018 if (TREE_STATIC (decl))
6019 return (alignment <= MAX_OFILE_ALIGNMENT);
6020 else
6021 return (alignment <= MAX_STACK_ALIGNMENT);
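/* Example (illustrative only): a file-scope "static int a[256];" whose
   symtab node allows can_increase_alignment_p can have its alignment
   forced up to MAX_OFILE_ALIGNMENT, while a local (stack) variable is
   limited to MAX_STACK_ALIGNMENT; a DECL that is not a VAR_P is rejected
   outright by this predicate.  */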
6025 /* Return whether the data reference DR is supported with respect to its
6026 alignment.
6027 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6028 if it is aligned, i.e., check if it is possible to vectorize it with different
6029 alignment. */
6031 enum dr_alignment_support
6032 vect_supportable_dr_alignment (struct data_reference *dr,
6033 bool check_aligned_accesses)
6035 gimple *stmt = DR_STMT (dr);
6036 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6037 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6038 machine_mode mode = TYPE_MODE (vectype);
6039 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6040 struct loop *vect_loop = NULL;
6041 bool nested_in_vect_loop = false;
6043 if (aligned_access_p (dr) && !check_aligned_accesses)
6044 return dr_aligned;
6046 /* For now assume all conditional loads/stores support unaligned
6047 access without any special code. */
6048 if (is_gimple_call (stmt)
6049 && gimple_call_internal_p (stmt)
6050 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6051 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6052 return dr_unaligned_supported;
6054 if (loop_vinfo)
6056 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6057 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
6060 /* Possibly unaligned access. */
6062 /* We can choose between using the implicit realignment scheme (generating
6063 a misaligned_move stmt) and the explicit realignment scheme (generating
6064 aligned loads with a REALIGN_LOAD). There are two variants to the
6065 explicit realignment scheme: optimized, and unoptimized.
6066 We can optimize the realignment only if the step between consecutive
6067 vector loads is equal to the vector size. Since the vector memory
6068 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6069 is guaranteed that the misalignment amount remains the same throughout the
6070 execution of the vectorized loop. Therefore, we can create the
6071 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6072 at the loop preheader.
6074 However, in the case of outer-loop vectorization, when vectorizing a
6075 memory access in the inner-loop nested within the LOOP that is now being
6076 vectorized, while it is guaranteed that the misalignment of the
6077 vectorized memory access will remain the same in different outer-loop
6078 iterations, it is *not* guaranteed that it will remain the same throughout
6079 the execution of the inner-loop. This is because the inner-loop advances
6080 with the original scalar step (and not in steps of VS). If the inner-loop
6081 step happens to be a multiple of VS, then the misalignment remains fixed
6082 and we can use the optimized realignment scheme. For example:
6084 for (i=0; i<N; i++)
6085 for (j=0; j<M; j++)
6086 s += a[i+j];
6088 When vectorizing the i-loop in the above example, the step between
6089 consecutive vector loads is 1, and so the misalignment does not remain
6090 fixed across the execution of the inner-loop, and the realignment cannot
6091 be optimized (as illustrated in the following pseudo vectorized loop):
6093 for (i=0; i<N; i+=4)
6094 for (j=0; j<M; j++){
6095 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6096 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6097 // (assuming that we start from an aligned address).
6100 We therefore have to use the unoptimized realignment scheme:
6102 for (i=0; i<N; i+=4)
6103 for (j=k; j<M; j+=4)
6104 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6105 // that the misalignment of the initial address is
6106 // 0).
6108 The loop can then be vectorized as follows:
6110 for (k=0; k<4; k++){
6111 rt = get_realignment_token (&vp[k]);
6112 for (i=0; i<N; i+=4){
6113 v1 = vp[i+k];
6114 for (j=k; j<M; j+=4){
6115 v2 = vp[i+j+VS-1];
6116 va = REALIGN_LOAD <v1,v2,rt>;
6117 vs += va;
6118 v1 = v2;
6121 } */
6123 if (DR_IS_READ (dr))
6125 bool is_packed = false;
6126 tree type = (TREE_TYPE (DR_REF (dr)));
6128 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6129 && (!targetm.vectorize.builtin_mask_for_load
6130 || targetm.vectorize.builtin_mask_for_load ()))
6132 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6134 /* If we are doing SLP then the accesses need not have the
6135 same alignment; instead it depends on the SLP group size. */
6136 if (loop_vinfo
6137 && STMT_SLP_TYPE (stmt_info)
6138 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6139 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
6140 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
6142 else if (!loop_vinfo
6143 || (nested_in_vect_loop
6144 && (TREE_INT_CST_LOW (DR_STEP (dr))
6145 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
6146 return dr_explicit_realign;
6147 else
6148 return dr_explicit_realign_optimized;
6150 if (!known_alignment_for_access_p (dr))
6151 is_packed = not_size_aligned (DR_REF (dr));
6153 if (targetm.vectorize.support_vector_misalignment
6154 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6155 /* Can't software pipeline the loads, but can at least do them. */
6156 return dr_unaligned_supported;
6158 else
6160 bool is_packed = false;
6161 tree type = (TREE_TYPE (DR_REF (dr)));
6163 if (!known_alignment_for_access_p (dr))
6164 is_packed = not_size_aligned (DR_REF (dr));
6166 if (targetm.vectorize.support_vector_misalignment
6167 (mode, type, DR_MISALIGNMENT (dr), is_packed))
6168 return dr_unaligned_supported;
6171 /* Unsupported. */
6172 return dr_unaligned_unsupported;
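/* Summary (for illustration; the authoritative behaviour is the code above):
     dr_aligned                    - the access is known to be aligned and
                                     needs no special handling.
     dr_unaligned_supported        - the target handles the misalignment
                                     directly (masked loads/stores, or
                                     support_vector_misalignment).
     dr_explicit_realign           - aligned loads plus REALIGN_LOAD, with
                                     the realignment token recomputed as
                                     needed.
     dr_explicit_realign_optimized - as above, but the misalignment is
                                     invariant, so the token is computed
                                     once in the loop preheader.
     dr_unaligned_unsupported      - the access cannot be vectorized with
                                     this misalignment.  */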