Alternative check for vector refs with same alignment
official-gcc.git: gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
54 /* Return true if load- or store-lanes optab OPTAB is implemented for
55 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
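/* For instance, asking about two V4SI vectors checks whether the target
   provides an integer mode of 2 * GET_MODE_BITSIZE (V4SImode) bits and
   implements the corresponding conversion optab (e.g. vec_load_lanes)
   between that array mode and V4SImode; if either is missing, the lanes
   optab cannot be used for the group.  */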
57 static bool
58 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
59 tree vectype, unsigned HOST_WIDE_INT count)
61 machine_mode mode, array_mode;
62 bool limit_p;
64 mode = TYPE_MODE (vectype);
65 limit_p = !targetm.array_mode_supported_p (mode, count);
66 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
67 MODE_INT, limit_p);
69 if (array_mode == BLKmode)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
74 GET_MODE_NAME (mode), count);
75 return false;
78 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
80 if (dump_enabled_p ())
81 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
82 "cannot use %s<%s><%s>\n", name,
83 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
84 return false;
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_NOTE, vect_location,
89 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
90 GET_MODE_NAME (mode));
92 return true;
96 /* Return the smallest scalar part of STMT.
97 This is used to determine the vectype of the stmt. We generally set the
98 vectype according to the type of the result (lhs). For stmts whose
99 result-type is different than the type of the arguments (e.g., demotion,
100 promotion), vectype will be reset appropriately (later). Note that we have
101 to visit the smallest datatype in this function, because that determines the
102 VF. If the smallest datatype in the loop is present only as the rhs of a
103 promotion operation - we'd miss it.
104 Such a case, where a variable of this datatype does not appear in the lhs
105 anywhere in the loop, can only occur if it's an invariant: e.g.:
106 'int_x = (int) short_inv', which we'd expect to have been optimized away by
107 invariant motion. However, we cannot rely on invariant motion to always
108 take invariants out of the loop, and so in the case of promotion we also
109 have to check the rhs.
110 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
111 types. */
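/* For instance, for a widening statement such as
     int_res = (int) short_a * (int) short_b;
   (a WIDEN_MULT_EXPR) the lhs type is 4 bytes and the rhs type is
   2 bytes, so *LHS_SIZE_UNIT is 4, *RHS_SIZE_UNIT is 2 and the returned
   scalar type is the 2-byte one - the type that determines the VF.  */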
113 tree
114 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
115 HOST_WIDE_INT *rhs_size_unit)
117 tree scalar_type = gimple_expr_type (stmt);
118 HOST_WIDE_INT lhs, rhs;
120 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
122 if (is_gimple_assign (stmt)
123 && (gimple_assign_cast_p (stmt)
124 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
125 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
126 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
128 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
130 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
131 if (rhs < lhs)
132 scalar_type = rhs_type;
135 *lhs_size_unit = lhs;
136 *rhs_size_unit = rhs;
137 return scalar_type;
141 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
142 tested at run-time. Return TRUE if DDR was successfully inserted.
143 Return false if versioning is not supported. */
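/* For instance, in
     void f (int *p, int *q, int n)
     { for (int i = 0; i < n; i++) p[i] = q[i] + 1; }
   the compile-time dependence between *p and *q is unknown, so the DDR
   is queued here and the loop is later versioned on a runtime check
   that the two accessed ranges do not overlap.  */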
145 static bool
146 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
148 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
150 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
151 return false;
153 if (dump_enabled_p ())
155 dump_printf_loc (MSG_NOTE, vect_location,
156 "mark for run-time aliasing test between ");
157 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
158 dump_printf (MSG_NOTE, " and ");
159 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
160 dump_printf (MSG_NOTE, "\n");
163 if (optimize_loop_nest_for_size_p (loop))
165 if (dump_enabled_p ())
166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
167 "versioning not supported when optimizing"
168 " for size.\n");
169 return false;
172 /* FORNOW: We don't support versioning with outer-loop vectorization. */
173 if (loop->inner)
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
177 "versioning not yet supported for outer-loops.\n");
178 return false;
181 /* FORNOW: We don't support creating runtime alias tests for non-constant
182 step. */
183 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
184 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
186 if (dump_enabled_p ())
187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
188 "versioning not yet supported for non-constant "
189 "step\n");
190 return false;
193 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
194 return true;
198 /* Function vect_analyze_data_ref_dependence.
200 Return TRUE if there (might) exist a dependence between a memory-reference
 201 DRA and a memory-reference DRB.  If the dependence can instead be
 202 checked at run time by versioning for alias, return FALSE.  Adjust
 203 *MAX_VF according to the data dependence.  */
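/* For instance, in
     for (i = 0; i < n; i++)
       a[i + 2] = a[i] + 1;
   the dependence distance in the loop is 2, so *MAX_VF is reduced to 2
   below; a distance of at least *MAX_VF does not constrain
   vectorization, and a distance of 0 is fine apart from the grouped
   access case handled below.  */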
205 static bool
206 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
207 loop_vec_info loop_vinfo, int *max_vf)
209 unsigned int i;
210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
211 struct data_reference *dra = DDR_A (ddr);
212 struct data_reference *drb = DDR_B (ddr);
213 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
214 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
215 lambda_vector dist_v;
216 unsigned int loop_depth;
218 /* In loop analysis all data references should be vectorizable. */
219 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
220 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
221 gcc_unreachable ();
223 /* Independent data accesses. */
224 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
225 return false;
227 if (dra == drb
228 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
229 return false;
231 /* We do not have to consider dependences between accesses that belong
232 to the same group. */
233 if (GROUP_FIRST_ELEMENT (stmtinfo_a)
234 && GROUP_FIRST_ELEMENT (stmtinfo_a) == GROUP_FIRST_ELEMENT (stmtinfo_b))
235 return false;
237 /* Even if we have an anti-dependence then, as the vectorized loop covers at
238 least two scalar iterations, there is always also a true dependence.
239 As the vectorizer does not re-order loads and stores we can ignore
240 the anti-dependence if TBAA can disambiguate both DRs similar to the
241 case with known negative distance anti-dependences (positive
242 distance anti-dependences would violate TBAA constraints). */
243 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
244 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
245 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
246 get_alias_set (DR_REF (drb))))
247 return false;
249 /* Unknown data dependence. */
250 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
252 /* If user asserted safelen consecutive iterations can be
253 executed concurrently, assume independence. */
254 if (loop->safelen >= 2)
256 if (loop->safelen < *max_vf)
257 *max_vf = loop->safelen;
258 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
259 return false;
262 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
263 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
265 if (dump_enabled_p ())
267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
268 "versioning for alias not supported for: "
269 "can't determine dependence between ");
270 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
271 DR_REF (dra));
272 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
273 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
274 DR_REF (drb));
275 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
277 return true;
280 if (dump_enabled_p ())
282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
283 "versioning for alias required: "
284 "can't determine dependence between ");
285 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
286 DR_REF (dra));
287 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
289 DR_REF (drb));
290 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
293 /* Add to list of ddrs that need to be tested at run-time. */
294 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
297 /* Known data dependence. */
298 if (DDR_NUM_DIST_VECTS (ddr) == 0)
300 /* If user asserted safelen consecutive iterations can be
301 executed concurrently, assume independence. */
302 if (loop->safelen >= 2)
304 if (loop->safelen < *max_vf)
305 *max_vf = loop->safelen;
306 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
307 return false;
310 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
311 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
313 if (dump_enabled_p ())
315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
316 "versioning for alias not supported for: "
317 "bad dist vector for ");
318 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
319 DR_REF (dra));
320 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
321 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
322 DR_REF (drb));
323 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
325 return true;
328 if (dump_enabled_p ())
330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
331 "versioning for alias required: "
332 "bad dist vector for ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
334 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
335 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
336 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
338 /* Add to list of ddrs that need to be tested at run-time. */
339 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
342 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
343 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
345 int dist = dist_v[loop_depth];
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance = %d.\n", dist);
351 if (dist == 0)
353 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "dependence distance == 0 between ");
357 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
358 dump_printf (MSG_NOTE, " and ");
359 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 360 dump_printf (MSG_NOTE, "\n");
363 /* When we perform grouped accesses and perform implicit CSE
364 by detecting equal accesses and doing disambiguation with
365 runtime alias tests like for
366 .. = a[i];
367 .. = a[i+1];
368 a[i] = ..;
369 a[i+1] = ..;
370 *p = ..;
371 .. = a[i];
372 .. = a[i+1];
373 where we will end up loading { a[i], a[i+1] } once, make
374 sure that inserting group loads before the first load and
375 stores after the last store will do the right thing.
376 Similar for groups like
377 a[i] = ...;
378 ... = a[i];
379 a[i+1] = ...;
380 where loads from the group interleave with the store. */
381 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
382 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
384 gimple *earlier_stmt;
385 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
386 if (DR_IS_WRITE
387 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
389 if (dump_enabled_p ())
390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
391 "READ_WRITE dependence in interleaving."
392 "\n");
393 return true;
397 continue;
400 if (dist > 0 && DDR_REVERSED_P (ddr))
402 /* If DDR_REVERSED_P the order of the data-refs in DDR was
403 reversed (to make distance vector positive), and the actual
404 distance is negative. */
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "dependence distance negative.\n");
408 /* Record a negative dependence distance to later limit the
409 amount of stmt copying / unrolling we can perform.
410 Only need to handle read-after-write dependence. */
411 if (DR_IS_READ (drb)
412 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
413 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
414 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
415 continue;
418 if (abs (dist) >= 2
419 && abs (dist) < *max_vf)
421 /* The dependence distance requires reduction of the maximal
422 vectorization factor. */
423 *max_vf = abs (dist);
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_NOTE, vect_location,
426 "adjusting maximal vectorization factor to %i\n",
427 *max_vf);
430 if (abs (dist) >= *max_vf)
432 /* Dependence distance does not create dependence, as far as
433 vectorization is concerned, in this case. */
434 if (dump_enabled_p ())
435 dump_printf_loc (MSG_NOTE, vect_location,
436 "dependence distance >= VF.\n");
437 continue;
440 if (dump_enabled_p ())
442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
443 "not vectorized, possible dependence "
444 "between data-refs ");
 445 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 446 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 447 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 448 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
451 return true;
454 return false;
457 /* Function vect_analyze_data_ref_dependences.
459 Examine all the data references in the loop, and make sure there do not
460 exist any data dependences between them. Set *MAX_VF according to
461 the maximum vectorization factor the data dependences allow. */
463 bool
464 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
466 unsigned int i;
467 struct data_dependence_relation *ddr;
469 if (dump_enabled_p ())
470 dump_printf_loc (MSG_NOTE, vect_location,
471 "=== vect_analyze_data_ref_dependences ===\n");
473 LOOP_VINFO_DDRS (loop_vinfo)
474 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
475 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
476 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
477 /* We need read-read dependences to compute STMT_VINFO_SAME_ALIGN_REFS. */
478 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
479 &LOOP_VINFO_DDRS (loop_vinfo),
480 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
481 return false;
 483 /* For epilogues we either have no aliases or alias versioning
 484 was applied to the original loop.  Therefore we may just get max_vf
 485 from the VF of the original loop.  */
486 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
487 *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
488 else
489 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
490 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
491 return false;
493 return true;
497 /* Function vect_slp_analyze_data_ref_dependence.
499 Return TRUE if there (might) exist a dependence between a memory-reference
 500 DRA and a memory-reference DRB in SLP (basic-block) vectorization.
 501 Unlike the loop variant there is no run-time alias versioning here
 502 and no vectorization factor to adjust.  */
504 static bool
505 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
507 struct data_reference *dra = DDR_A (ddr);
508 struct data_reference *drb = DDR_B (ddr);
510 /* We need to check dependences of statements marked as unvectorizable
511 as well, they still can prohibit vectorization. */
513 /* Independent data accesses. */
514 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
515 return false;
517 if (dra == drb)
518 return false;
520 /* Read-read is OK. */
521 if (DR_IS_READ (dra) && DR_IS_READ (drb))
522 return false;
524 /* If dra and drb are part of the same interleaving chain consider
525 them independent. */
526 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
527 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
528 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
529 return false;
531 /* Unknown data dependence. */
532 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537 "can't determine dependence between ");
538 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
541 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 else if (dump_enabled_p ())
546 dump_printf_loc (MSG_NOTE, vect_location,
547 "determined dependence between ");
548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
549 dump_printf (MSG_NOTE, " and ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
551 dump_printf (MSG_NOTE, "\n");
554 return true;
558 /* Analyze dependences involved in the transform of SLP NODE. STORES
559 contain the vector of scalar stores of this instance if we are
560 disambiguating the loads. */
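/* Vectorizing the store group of an instance places the vector store at
   the position of the last scalar store of the group, so every statement
   between a group member and that last statement is checked below for a
   dependence that would make such sinking invalid; loads are checked the
   same way against LAST_STORE.  */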
562 static bool
563 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
564 vec<gimple *> stores, gimple *last_store)
566 /* This walks over all stmts involved in the SLP load/store done
567 in NODE verifying we can sink them up to the last stmt in the
568 group. */
569 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
570 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
572 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
573 if (access == last_access)
574 continue;
575 data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
576 for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
577 gsi_stmt (gsi) != last_access; gsi_next (&gsi))
579 gimple *stmt = gsi_stmt (gsi);
580 if (! gimple_vuse (stmt)
581 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
582 continue;
584 /* If we couldn't record a (single) data reference for this
585 stmt we have to give up. */
586 /* ??? Here and below if dependence analysis fails we can resort
587 to the alias oracle which can handle more kinds of stmts. */
588 data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
589 if (!dr_b)
590 return false;
592 bool dependent = false;
593 /* If we run into a store of this same instance (we've just
594 marked those) then delay dependence checking until we run
595 into the last store because this is where it will have
596 been sunk to (and we verify if we can do that as well). */
597 if (gimple_visited_p (stmt))
599 if (stmt != last_store)
600 continue;
601 unsigned i;
602 gimple *store;
603 FOR_EACH_VEC_ELT (stores, i, store)
605 data_reference *store_dr
606 = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
607 ddr_p ddr = initialize_data_dependence_relation
608 (dr_a, store_dr, vNULL);
609 dependent = vect_slp_analyze_data_ref_dependence (ddr);
610 free_dependence_relation (ddr);
611 if (dependent)
612 break;
615 else
617 ddr_p ddr = initialize_data_dependence_relation (dr_a,
618 dr_b, vNULL);
619 dependent = vect_slp_analyze_data_ref_dependence (ddr);
620 free_dependence_relation (ddr);
622 if (dependent)
623 return false;
626 return true;
 630 /* Function vect_slp_analyze_instance_dependence.
 632 Examine all the data references in the SLP instance and make sure there
 633 do not exist any data dependences that would prevent vectorizing the
 634 instance.  Return TRUE if the instance passes the dependence checks.  */
636 bool
637 vect_slp_analyze_instance_dependence (slp_instance instance)
639 if (dump_enabled_p ())
640 dump_printf_loc (MSG_NOTE, vect_location,
641 "=== vect_slp_analyze_instance_dependence ===\n");
643 /* The stores of this instance are at the root of the SLP tree. */
644 slp_tree store = SLP_INSTANCE_TREE (instance);
645 if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
646 store = NULL;
648 /* Verify we can sink stores to the vectorized stmt insert location. */
649 gimple *last_store = NULL;
650 if (store)
652 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
653 return false;
655 /* Mark stores in this instance and remember the last one. */
656 last_store = vect_find_last_scalar_stmt_in_slp (store);
657 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
658 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
661 bool res = true;
663 /* Verify we can sink loads to the vectorized stmt insert location,
664 special-casing stores of this instance. */
665 slp_tree load;
666 unsigned int i;
667 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
668 if (! vect_slp_analyze_node_dependences (instance, load,
669 store
670 ? SLP_TREE_SCALAR_STMTS (store)
671 : vNULL, last_store))
673 res = false;
674 break;
677 /* Unset the visited flag. */
678 if (store)
679 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
680 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
682 return res;
685 /* Function vect_compute_data_ref_alignment
687 Compute the misalignment of the data reference DR.
689 Output:
690 1. If during the misalignment computation it is found that the data reference
691 cannot be vectorized then false is returned.
692 2. DR_MISALIGNMENT (DR) is defined.
694 FOR NOW: No analysis is actually performed. Misalignment is calculated
695 only for trivial cases. TODO. */
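/* For instance, for a 16-byte aligned 'int a[N]' accessed as a[i+1]
   with a V4SI vectype this records DR_MISALIGNMENT (dr) == 4 bytes;
   when the base alignment or the offset cannot be determined the
   misalignment is left as DR_MISALIGNMENT_UNKNOWN.  */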
697 bool
698 vect_compute_data_ref_alignment (struct data_reference *dr)
700 gimple *stmt = DR_STMT (dr);
701 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
702 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
703 struct loop *loop = NULL;
704 tree ref = DR_REF (dr);
705 tree vectype;
706 tree base, base_addr;
707 tree misalign = NULL_TREE;
708 tree aligned_to;
709 tree step;
710 unsigned HOST_WIDE_INT alignment;
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_NOTE, vect_location,
714 "vect_compute_data_ref_alignment:\n");
716 if (loop_vinfo)
717 loop = LOOP_VINFO_LOOP (loop_vinfo);
719 /* Initialize misalignment to unknown. */
720 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
722 if (tree_fits_shwi_p (DR_STEP (dr)))
723 misalign = DR_INIT (dr);
724 aligned_to = DR_ALIGNED_TO (dr);
725 base_addr = DR_BASE_ADDRESS (dr);
726 vectype = STMT_VINFO_VECTYPE (stmt_info);
728 /* In case the dataref is in an inner-loop of the loop that is being
729 vectorized (LOOP), we use the base and misalignment information
730 relative to the outer-loop (LOOP). This is ok only if the misalignment
731 stays the same throughout the execution of the inner-loop, which is why
 732 we have to check that the stride of the dataref in the inner-loop is a
 733 multiple of the vector size.  */
734 if (loop && nested_in_vect_loop_p (loop, stmt))
736 tree step = DR_STEP (dr);
738 if (tree_fits_shwi_p (step)
739 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_NOTE, vect_location,
743 "inner step divides the vector-size.\n");
744 misalign = STMT_VINFO_DR_INIT (stmt_info);
745 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
746 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
748 else
750 if (dump_enabled_p ())
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
752 "inner step doesn't divide the vector-size.\n");
753 misalign = NULL_TREE;
757 /* Similarly we can only use base and misalignment information relative to
758 an innermost loop if the misalignment stays the same throughout the
 759 execution of the loop.  As above, this is the case if the stride of
 760 the dataref times the vectorization factor is a multiple of the vector size.  */
761 else
763 tree step = DR_STEP (dr);
764 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
766 if (tree_fits_shwi_p (step)
767 && ((tree_to_shwi (step) * vf)
768 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
770 if (dump_enabled_p ())
771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
772 "step doesn't divide the vector-size.\n");
773 misalign = NULL_TREE;
777 /* To look at alignment of the base we have to preserve an inner MEM_REF
778 as that carries alignment information of the actual access. */
779 base = ref;
780 while (handled_component_p (base))
781 base = TREE_OPERAND (base, 0);
782 unsigned int base_alignment = 0;
783 unsigned HOST_WIDE_INT base_bitpos;
784 get_object_alignment_1 (base, &base_alignment, &base_bitpos);
785 /* As data-ref analysis strips the MEM_REF down to its base operand
786 to form DR_BASE_ADDRESS and adds the offset to DR_INIT we have to
787 adjust things to make base_alignment valid as the alignment of
788 DR_BASE_ADDRESS. */
789 if (TREE_CODE (base) == MEM_REF)
791 /* Note all this only works if DR_BASE_ADDRESS is the same as
792 MEM_REF operand zero, otherwise DR/SCEV analysis might have factored
 793 in other offsets.  We need to rework DR to compute the alignment
794 of DR_BASE_ADDRESS as long as all information is still available. */
795 if (operand_equal_p (TREE_OPERAND (base, 0), base_addr, 0))
797 base_bitpos -= mem_ref_offset (base).to_short_addr () * BITS_PER_UNIT;
798 base_bitpos &= (base_alignment - 1);
800 else
801 base_bitpos = BITS_PER_UNIT;
803 if (base_bitpos != 0)
804 base_alignment = base_bitpos & -base_bitpos;
805 /* Also look at the alignment of the base address DR analysis
806 computed. */
807 unsigned int base_addr_alignment = get_pointer_alignment (base_addr);
808 if (base_addr_alignment > base_alignment)
809 base_alignment = base_addr_alignment;
811 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
812 DR_VECT_AUX (dr)->base_element_aligned = true;
814 alignment = TYPE_ALIGN_UNIT (vectype);
816 if ((compare_tree_int (aligned_to, alignment) < 0)
817 || !misalign)
819 if (dump_enabled_p ())
821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
822 "Unknown alignment for access: ");
823 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
824 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
826 return true;
829 if (base_alignment < TYPE_ALIGN (vectype))
831 base = base_addr;
832 if (TREE_CODE (base) == ADDR_EXPR)
833 base = TREE_OPERAND (base, 0);
834 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
836 if (dump_enabled_p ())
838 dump_printf_loc (MSG_NOTE, vect_location,
839 "can't force alignment of ref: ");
840 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
841 dump_printf (MSG_NOTE, "\n");
843 return true;
846 if (DECL_USER_ALIGN (base))
848 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location,
851 "not forcing alignment of user-aligned "
852 "variable: ");
853 dump_generic_expr (MSG_NOTE, TDF_SLIM, base);
854 dump_printf (MSG_NOTE, "\n");
856 return true;
859 /* Force the alignment of the decl.
860 NOTE: This is the only change to the code we make during
861 the analysis phase, before deciding to vectorize the loop. */
862 if (dump_enabled_p ())
864 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
865 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
866 dump_printf (MSG_NOTE, "\n");
869 DR_VECT_AUX (dr)->base_decl = base;
870 DR_VECT_AUX (dr)->base_misaligned = true;
871 DR_VECT_AUX (dr)->base_element_aligned = true;
874 if (loop && nested_in_vect_loop_p (loop, stmt))
875 step = STMT_VINFO_DR_STEP (stmt_info);
876 else
877 step = DR_STEP (dr);
878 /* If this is a backward running DR then first access in the larger
879 vectype actually is N-1 elements before the address in the DR.
880 Adjust misalign accordingly. */
881 if (tree_int_cst_sgn (step) < 0)
883 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
884 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
885 otherwise we wouldn't be here. */
886 offset = fold_build2 (MULT_EXPR, ssizetype, offset, step);
887 /* PLUS because STEP was negative. */
888 misalign = size_binop (PLUS_EXPR, misalign, offset);
891 SET_DR_MISALIGNMENT (dr,
892 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
894 if (dump_enabled_p ())
896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
897 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
898 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
899 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
902 return true;
906 /* Function vect_update_misalignment_for_peel.
907 Sets DR's misalignment
908 - to 0 if it has the same alignment as DR_PEEL,
 909 - to the misalignment computed using NPEEL if DR's alignment is known,
910 - to -1 (unknown) otherwise.
912 DR - the data reference whose misalignment is to be adjusted.
913 DR_PEEL - the data reference whose misalignment is being made
914 zero in the vector loop by the peel.
915 NPEEL - the number of iterations in the peel loop if the misalignment
916 of DR_PEEL is known at compile time. */
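/* For instance, with a V4SI vectype (16-byte vector alignment),
   DR_SIZE == 4 and NPEEL == 2, a data reference with known misalignment
   4 and a positive step gets (4 + 2 * 4) & 15 == 12 as its new
   misalignment; when either misalignment is unknown the result is
   DR_MISALIGNMENT_UNKNOWN.  */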
918 static void
919 vect_update_misalignment_for_peel (struct data_reference *dr,
920 struct data_reference *dr_peel, int npeel)
922 unsigned int i;
923 vec<dr_p> same_aligned_drs;
924 struct data_reference *current_dr;
925 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
926 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
927 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
928 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
930 /* For interleaved data accesses the step in the loop must be multiplied by
931 the size of the interleaving group. */
932 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
933 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
934 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
935 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
937 /* It can be assumed that the data refs with the same alignment as dr_peel
938 are aligned in the vector loop. */
939 same_aligned_drs
940 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
941 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
943 if (current_dr != dr)
944 continue;
945 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
946 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
947 SET_DR_MISALIGNMENT (dr, 0);
948 return;
951 if (known_alignment_for_access_p (dr)
952 && known_alignment_for_access_p (dr_peel))
954 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
955 int misal = DR_MISALIGNMENT (dr);
956 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
957 misal += negative ? -npeel * dr_size : npeel * dr_size;
958 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
959 SET_DR_MISALIGNMENT (dr, misal);
960 return;
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
965 "to unknown (-1).\n");
966 SET_DR_MISALIGNMENT (dr, DR_MISALIGNMENT_UNKNOWN);
970 /* Function verify_data_ref_alignment
972 Return TRUE if DR can be handled with respect to alignment. */
974 static bool
975 verify_data_ref_alignment (data_reference_p dr)
977 enum dr_alignment_support supportable_dr_alignment
978 = vect_supportable_dr_alignment (dr, false);
979 if (!supportable_dr_alignment)
981 if (dump_enabled_p ())
983 if (DR_IS_READ (dr))
984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
985 "not vectorized: unsupported unaligned load.");
986 else
987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
988 "not vectorized: unsupported unaligned "
989 "store.");
991 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
992 DR_REF (dr));
993 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
995 return false;
998 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
999 dump_printf_loc (MSG_NOTE, vect_location,
1000 "Vectorizing an unaligned access.\n");
1002 return true;
1005 /* Function vect_verify_datarefs_alignment
1007 Return TRUE if all data references in the loop can be
1008 handled with respect to alignment. */
1010 bool
1011 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1013 vec<data_reference_p> datarefs = vinfo->datarefs;
1014 struct data_reference *dr;
1015 unsigned int i;
1017 FOR_EACH_VEC_ELT (datarefs, i, dr)
1019 gimple *stmt = DR_STMT (dr);
1020 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1022 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1023 continue;
1025 /* For interleaving, only the alignment of the first access matters. */
1026 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1027 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1028 continue;
1030 /* Strided accesses perform only component accesses, alignment is
1031 irrelevant for them. */
1032 if (STMT_VINFO_STRIDED_P (stmt_info)
1033 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1034 continue;
1036 if (! verify_data_ref_alignment (dr))
1037 return false;
1040 return true;
 1043 /* Given a memory reference EXP return whether its alignment is less
1044 than its size. */
1046 static bool
1047 not_size_aligned (tree exp)
1049 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1050 return true;
1052 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1053 > get_object_alignment (exp));
1056 /* Function vector_alignment_reachable_p
1058 Return true if vector alignment for DR is reachable by peeling
1059 a few loop iterations. Return false otherwise. */
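/* For instance, for a V4SI access with 4-byte elements a known
   misalignment of 8 bytes is reachable (two peeled iterations align it),
   while a misalignment of 2 bytes is not, since peeling whole elements
   can never remove a misalignment that the element size does not
   divide.  */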
1061 static bool
1062 vector_alignment_reachable_p (struct data_reference *dr)
1064 gimple *stmt = DR_STMT (dr);
1065 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1066 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1068 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 1070 /* For interleaved accesses we peel only if the number of iterations
 1071 in the prologue loop (VF - misalignment, in elements) is a multiple
 1072 of the interleaving group size.  */
1073 int elem_size, mis_in_elements;
1074 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1076 /* FORNOW: handle only known alignment. */
1077 if (!known_alignment_for_access_p (dr))
1078 return false;
1080 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1081 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1083 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1084 return false;
 1087 /* If the misalignment is known at compile time then allow peeling
 1088 only if natural alignment is reachable through peeling.  */
1089 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1091 HOST_WIDE_INT elmsize =
1092 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1093 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_NOTE, vect_location,
1096 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1097 dump_printf (MSG_NOTE,
1098 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1100 if (DR_MISALIGNMENT (dr) % elmsize)
1102 if (dump_enabled_p ())
1103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1104 "data size does not divide the misalignment.\n");
1105 return false;
1109 if (!known_alignment_for_access_p (dr))
1111 tree type = TREE_TYPE (DR_REF (dr));
1112 bool is_packed = not_size_aligned (DR_REF (dr));
1113 if (dump_enabled_p ())
1114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1115 "Unknown misalignment, %snaturally aligned\n",
1116 is_packed ? "not " : "");
1117 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1120 return true;
1124 /* Calculate the cost of the memory access represented by DR. */
1126 static void
1127 vect_get_data_access_cost (struct data_reference *dr,
1128 unsigned int *inside_cost,
1129 unsigned int *outside_cost,
1130 stmt_vector_for_cost *body_cost_vec)
1132 gimple *stmt = DR_STMT (dr);
1133 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1134 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1135 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1136 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1137 int ncopies = MAX (1, vf / nunits); /* TODO: Handle SLP properly */
1139 if (DR_IS_READ (dr))
1140 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1141 NULL, body_cost_vec, false);
1142 else
1143 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1145 if (dump_enabled_p ())
1146 dump_printf_loc (MSG_NOTE, vect_location,
1147 "vect_get_data_access_cost: inside_cost = %d, "
1148 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1152 typedef struct _vect_peel_info
1154 struct data_reference *dr;
1155 int npeel;
1156 unsigned int count;
1157 } *vect_peel_info;
1159 typedef struct _vect_peel_extended_info
1161 struct _vect_peel_info peel_info;
1162 unsigned int inside_cost;
1163 unsigned int outside_cost;
1164 stmt_vector_for_cost body_cost_vec;
1165 } *vect_peel_extended_info;
1168 /* Peeling hashtable helpers. */
1170 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1172 static inline hashval_t hash (const _vect_peel_info *);
1173 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1176 inline hashval_t
1177 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1179 return (hashval_t) peel_info->npeel;
1182 inline bool
1183 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1185 return (a->npeel == b->npeel);
1189 /* Insert DR into peeling hash table with NPEEL as key. */
1191 static void
1192 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1193 loop_vec_info loop_vinfo, struct data_reference *dr,
1194 int npeel)
1196 struct _vect_peel_info elem, *slot;
1197 _vect_peel_info **new_slot;
1198 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1200 elem.npeel = npeel;
1201 slot = peeling_htab->find (&elem);
1202 if (slot)
1203 slot->count++;
1204 else
1206 slot = XNEW (struct _vect_peel_info);
1207 slot->npeel = npeel;
1208 slot->dr = dr;
1209 slot->count = 1;
1210 new_slot = peeling_htab->find_slot (slot, INSERT);
1211 *new_slot = slot;
1214 if (!supportable_dr_alignment
1215 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1216 slot->count += VECT_MAX_COST;
 1220 /* Traverse the peeling hash table to find the peeling option that aligns
 1221 the maximum number of data accesses.  */
1224 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1225 _vect_peel_extended_info *max)
1227 vect_peel_info elem = *slot;
1229 if (elem->count > max->peel_info.count
1230 || (elem->count == max->peel_info.count
1231 && max->peel_info.npeel > elem->npeel))
1233 max->peel_info.npeel = elem->npeel;
1234 max->peel_info.count = elem->count;
1235 max->peel_info.dr = elem->dr;
1238 return 1;
1241 /* Get the costs of peeling NPEEL iterations checking data access costs
1242 for all data refs. If UNKNOWN_MISALIGNMENT is true, we assume DR0's
1243 misalignment will be zero after peeling. */
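/* The costs are accumulated over all data references of the loop,
   temporarily giving DR0 a misalignment of zero (or, roughly, giving
   each other reference the misalignment NPEEL peeled iterations would
   leave it with) and restoring the saved misalignments afterwards.  */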
1245 static void
1246 vect_get_peeling_costs_all_drs (struct data_reference *dr0,
1247 unsigned int *inside_cost,
1248 unsigned int *outside_cost,
1249 stmt_vector_for_cost *body_cost_vec,
1250 unsigned int npeel,
1251 bool unknown_misalignment)
1253 gimple *stmt = DR_STMT (dr0);
1254 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1255 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1256 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1258 unsigned i;
1259 data_reference *dr;
1261 FOR_EACH_VEC_ELT (datarefs, i, dr)
1263 stmt = DR_STMT (dr);
1264 stmt_info = vinfo_for_stmt (stmt);
1265 /* For interleaving, only the alignment of the first access
1266 matters. */
1267 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1268 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1269 continue;
1271 /* Strided accesses perform only component accesses, alignment is
1272 irrelevant for them. */
1273 if (STMT_VINFO_STRIDED_P (stmt_info)
1274 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1275 continue;
1277 int save_misalignment;
1278 save_misalignment = DR_MISALIGNMENT (dr);
1279 if (unknown_misalignment && dr == dr0)
1280 SET_DR_MISALIGNMENT (dr, 0);
1281 else
1282 vect_update_misalignment_for_peel (dr, dr0, npeel);
1283 vect_get_data_access_cost (dr, inside_cost, outside_cost,
1284 body_cost_vec);
1285 SET_DR_MISALIGNMENT (dr, save_misalignment);
1289 /* Traverse peeling hash table and calculate cost for each peeling option.
1290 Find the one with the lowest cost. */
1293 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1294 _vect_peel_extended_info *min)
1296 vect_peel_info elem = *slot;
1297 int dummy;
1298 unsigned int inside_cost = 0, outside_cost = 0;
1299 gimple *stmt = DR_STMT (elem->dr);
1300 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1301 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1302 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1303 epilogue_cost_vec;
1305 prologue_cost_vec.create (2);
1306 body_cost_vec.create (2);
1307 epilogue_cost_vec.create (2);
1309 vect_get_peeling_costs_all_drs (elem->dr, &inside_cost, &outside_cost,
1310 &body_cost_vec, elem->npeel, false);
1312 outside_cost += vect_get_known_peeling_cost
1313 (loop_vinfo, elem->npeel, &dummy,
1314 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1315 &prologue_cost_vec, &epilogue_cost_vec);
1317 /* Prologue and epilogue costs are added to the target model later.
1318 These costs depend only on the scalar iteration cost, the
1319 number of peeling iterations finally chosen, and the number of
1320 misaligned statements. So discard the information found here. */
1321 prologue_cost_vec.release ();
1322 epilogue_cost_vec.release ();
1324 if (inside_cost < min->inside_cost
1325 || (inside_cost == min->inside_cost
1326 && outside_cost < min->outside_cost))
1328 min->inside_cost = inside_cost;
1329 min->outside_cost = outside_cost;
1330 min->body_cost_vec.release ();
1331 min->body_cost_vec = body_cost_vec;
1332 min->peel_info.dr = elem->dr;
1333 min->peel_info.npeel = elem->npeel;
1334 min->peel_info.count = elem->count;
1336 else
1337 body_cost_vec.release ();
1339 return 1;
1343 /* Choose best peeling option by traversing peeling hash table and either
1344 choosing an option with the lowest cost (if cost model is enabled) or the
1345 option that aligns as many accesses as possible. */
1347 static struct _vect_peel_extended_info
1348 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1349 loop_vec_info loop_vinfo,
1350 unsigned int *npeel,
1351 stmt_vector_for_cost *body_cost_vec)
1353 struct _vect_peel_extended_info res;
1355 res.peel_info.dr = NULL;
1356 res.body_cost_vec = stmt_vector_for_cost ();
1358 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1360 res.inside_cost = INT_MAX;
1361 res.outside_cost = INT_MAX;
1362 peeling_htab->traverse <_vect_peel_extended_info *,
1363 vect_peeling_hash_get_lowest_cost> (&res);
1365 else
1367 res.peel_info.count = 0;
1368 peeling_htab->traverse <_vect_peel_extended_info *,
1369 vect_peeling_hash_get_most_frequent> (&res);
1370 res.inside_cost = 0;
1371 res.outside_cost = 0;
1374 *npeel = res.peel_info.npeel;
1375 *body_cost_vec = res.body_cost_vec;
1376 return res;
1379 /* Return true if the new peeling NPEEL is supported. */
1381 static bool
1382 vect_peeling_supportable (loop_vec_info loop_vinfo, struct data_reference *dr0,
1383 unsigned npeel)
1385 unsigned i;
1386 struct data_reference *dr = NULL;
1387 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1388 gimple *stmt;
1389 stmt_vec_info stmt_info;
1390 enum dr_alignment_support supportable_dr_alignment;
1392 /* Ensure that all data refs can be vectorized after the peel. */
1393 FOR_EACH_VEC_ELT (datarefs, i, dr)
1395 int save_misalignment;
1397 if (dr == dr0)
1398 continue;
1400 stmt = DR_STMT (dr);
1401 stmt_info = vinfo_for_stmt (stmt);
1402 /* For interleaving, only the alignment of the first access
1403 matters. */
1404 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1405 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1406 continue;
1408 /* Strided accesses perform only component accesses, alignment is
1409 irrelevant for them. */
1410 if (STMT_VINFO_STRIDED_P (stmt_info)
1411 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1412 continue;
1414 save_misalignment = DR_MISALIGNMENT (dr);
1415 vect_update_misalignment_for_peel (dr, dr0, npeel);
1416 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1417 SET_DR_MISALIGNMENT (dr, save_misalignment);
1419 if (!supportable_dr_alignment)
1420 return false;
1423 return true;
1426 /* Function vect_enhance_data_refs_alignment
1428 This pass will use loop versioning and loop peeling in order to enhance
1429 the alignment of data references in the loop.
1431 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1432 original loop is to be vectorized. Any other loops that are created by
1433 the transformations performed in this pass - are not supposed to be
1434 vectorized. This restriction will be relaxed.
1436 This pass will require a cost model to guide it whether to apply peeling
1437 or versioning or a combination of the two. For example, the scheme that
 1438 Intel uses when given a loop with several memory accesses, is as follows:
1439 choose one memory access ('p') which alignment you want to force by doing
1440 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1441 other accesses are not necessarily aligned, or (2) use loop versioning to
1442 generate one loop in which all accesses are aligned, and another loop in
1443 which only 'p' is necessarily aligned.
1445 ("Automatic Intra-Register Vectorization for the Intel Architecture",
 1446 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1447 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1449 Devising a cost model is the most critical aspect of this work. It will
1450 guide us on which access to peel for, whether to use loop versioning, how
1451 many versions to create, etc. The cost model will probably consist of
1452 generic considerations as well as target specific considerations (on
1453 powerpc for example, misaligned stores are more painful than misaligned
1454 loads).
1456 Here are the general steps involved in alignment enhancements:
1458 -- original loop, before alignment analysis:
1459 for (i=0; i<N; i++){
1460 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1461 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1464 -- After vect_compute_data_refs_alignment:
1465 for (i=0; i<N; i++){
1466 x = q[i]; # DR_MISALIGNMENT(q) = 3
1467 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1470 -- Possibility 1: we do loop versioning:
1471 if (p is aligned) {
1472 for (i=0; i<N; i++){ # loop 1A
1473 x = q[i]; # DR_MISALIGNMENT(q) = 3
1474 p[i] = y; # DR_MISALIGNMENT(p) = 0
1477 else {
1478 for (i=0; i<N; i++){ # loop 1B
1479 x = q[i]; # DR_MISALIGNMENT(q) = 3
1480 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1484 -- Possibility 2: we do loop peeling:
1485 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1486 x = q[i];
1487 p[i] = y;
1489 for (i = 3; i < N; i++){ # loop 2A
1490 x = q[i]; # DR_MISALIGNMENT(q) = 0
1491 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1494 -- Possibility 3: combination of loop peeling and versioning:
1495 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1496 x = q[i];
1497 p[i] = y;
1499 if (p is aligned) {
1500 for (i = 3; i<N; i++){ # loop 3A
1501 x = q[i]; # DR_MISALIGNMENT(q) = 0
1502 p[i] = y; # DR_MISALIGNMENT(p) = 0
1505 else {
1506 for (i = 3; i<N; i++){ # loop 3B
1507 x = q[i]; # DR_MISALIGNMENT(q) = 0
1508 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1512 These loops are later passed to loop_transform to be vectorized. The
1513 vectorizer will use the alignment information to guide the transformation
1514 (whether to generate regular loads/stores, or with special handling for
1515 misalignment). */
1517 bool
1518 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1520 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1521 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1522 enum dr_alignment_support supportable_dr_alignment;
1523 struct data_reference *dr0 = NULL, *first_store = NULL;
1524 struct data_reference *dr;
1525 unsigned int i, j;
1526 bool do_peeling = false;
1527 bool do_versioning = false;
1528 bool stat;
1529 gimple *stmt;
1530 stmt_vec_info stmt_info;
1531 unsigned int npeel = 0;
1532 bool one_misalignment_known = false;
1533 bool one_misalignment_unknown = false;
1534 bool one_dr_unsupportable = false;
1535 struct data_reference *unsupportable_dr = NULL;
1536 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1537 unsigned possible_npeel_number = 1;
1538 tree vectype;
1539 unsigned int nelements, mis, same_align_drs_max = 0;
1540 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1541 hash_table<peel_info_hasher> peeling_htab (1);
1543 if (dump_enabled_p ())
1544 dump_printf_loc (MSG_NOTE, vect_location,
1545 "=== vect_enhance_data_refs_alignment ===\n");
1547 /* Reset data so we can safely be called multiple times. */
1548 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1549 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1551 /* While cost model enhancements are expected in the future, the high level
1552 view of the code at this time is as follows:
1554 A) If there is a misaligned access then see if peeling to align
1555 this access can make all data references satisfy
1556 vect_supportable_dr_alignment. If so, update data structures
1557 as needed and return true.
1559 B) If peeling wasn't possible and there is a data reference with an
1560 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1561 then see if loop versioning checks can be used to make all data
1562 references satisfy vect_supportable_dr_alignment. If so, update
1563 data structures as needed and return true.
1565 C) If neither peeling nor versioning were successful then return false if
1566 any data reference does not satisfy vect_supportable_dr_alignment.
1568 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1570 Note, Possibility 3 above (which is peeling and versioning together) is not
1571 being done at this time. */
1573 /* (1) Peeling to force alignment. */
1575 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1576 Considerations:
1577 + How many accesses will become aligned due to the peeling
1578 - How many accesses will become unaligned due to the peeling,
1579 and the cost of misaligned accesses.
1580 - The cost of peeling (the extra runtime checks, the increase
1581 in code size). */
1583 FOR_EACH_VEC_ELT (datarefs, i, dr)
1585 stmt = DR_STMT (dr);
1586 stmt_info = vinfo_for_stmt (stmt);
1588 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1589 continue;
1591 /* For interleaving, only the alignment of the first access
1592 matters. */
1593 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1594 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1595 continue;
1597 /* For invariant accesses there is nothing to enhance. */
1598 if (integer_zerop (DR_STEP (dr)))
1599 continue;
1601 /* Strided accesses perform only component accesses, alignment is
1602 irrelevant for them. */
1603 if (STMT_VINFO_STRIDED_P (stmt_info)
1604 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1605 continue;
1607 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1608 do_peeling = vector_alignment_reachable_p (dr);
1609 if (do_peeling)
1611 if (known_alignment_for_access_p (dr))
1613 unsigned int npeel_tmp = 0;
1614 bool negative = tree_int_cst_compare (DR_STEP (dr),
1615 size_zero_node) < 0;
1617 vectype = STMT_VINFO_VECTYPE (stmt_info);
1618 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1619 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1620 TREE_TYPE (DR_REF (dr))));
1621 if (DR_MISALIGNMENT (dr) != 0)
1622 npeel_tmp = (negative ? (mis - nelements)
1623 : (nelements - mis)) & (nelements - 1);
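/* For instance, for a V4SI access (NELEMENTS == 4, 4-byte elements)
   misaligned by 8 bytes, MIS is 2 elements and, with a positive step,
   NPEEL_TMP == (4 - 2) & 3 == 2 scalar iterations.  */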
1625 /* For multiple types, it is possible that the bigger type access
1626 will have more than one peeling option. E.g., a loop with two
1627 types: one of size (vector size / 4), and the other one of
 1628 size (vector size / 8).  The vectorization factor will be 8.  If both
1629 accesses are misaligned by 3, the first one needs one scalar
1630 iteration to be aligned, and the second one needs 5. But the
1631 first one will be aligned also by peeling 5 scalar
1632 iterations, and in that case both accesses will be aligned.
1633 Hence, except for the immediate peeling amount, we also want
1634 to try to add full vector size, while we don't exceed
1635 vectorization factor.
1636 We do this automatically for cost model, since we calculate
1637 cost for every peeling option. */
1638 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1640 if (STMT_SLP_TYPE (stmt_info))
1641 possible_npeel_number
1642 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1643 else
1644 possible_npeel_number = vf / nelements;
1646 /* NPEEL_TMP is 0 when there is no misalignment, but also
1647 allow peeling NELEMENTS. */
1648 if (DR_MISALIGNMENT (dr) == 0)
1649 possible_npeel_number++;
1652 /* Save info about DR in the hash table. Also include peeling
1653 amounts according to the explanation above. */
1654 for (j = 0; j < possible_npeel_number; j++)
1656 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1657 dr, npeel_tmp);
1658 npeel_tmp += nelements;
1661 one_misalignment_known = true;
1663 else
 1665 /* If we don't know any misalignment values, we prefer
 1666 peeling for the data-ref that has the maximum number of data-refs
 1667 with the same alignment, unless the target prefers to align
 1668 stores over loads.  */
1669 unsigned same_align_drs
1670 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1671 if (!dr0
1672 || same_align_drs_max < same_align_drs)
1674 same_align_drs_max = same_align_drs;
1675 dr0 = dr;
1677 /* For data-refs with the same number of related
1678 accesses prefer the one where the misalign
1679 computation will be invariant in the outermost loop. */
1680 else if (same_align_drs_max == same_align_drs)
1682 struct loop *ivloop0, *ivloop;
1683 ivloop0 = outermost_invariant_loop_for_expr
1684 (loop, DR_BASE_ADDRESS (dr0));
1685 ivloop = outermost_invariant_loop_for_expr
1686 (loop, DR_BASE_ADDRESS (dr));
1687 if ((ivloop && !ivloop0)
1688 || (ivloop && ivloop0
1689 && flow_loop_nested_p (ivloop, ivloop0)))
1690 dr0 = dr;
1693 one_misalignment_unknown = true;
1695 /* Check for data refs with unsupportable alignment that
1696 can be peeled. */
1697 if (!supportable_dr_alignment)
1699 one_dr_unsupportable = true;
1700 unsupportable_dr = dr;
1703 if (!first_store && DR_IS_WRITE (dr))
1704 first_store = dr;
1707 else
1709 if (!aligned_access_p (dr))
1711 if (dump_enabled_p ())
1712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1713 "vector alignment may not be reachable\n");
1714 break;
1719 /* Check if we can possibly peel the loop. */
1720 if (!vect_can_advance_ivs_p (loop_vinfo)
1721 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1722 || loop->inner)
1723 do_peeling = false;
1725 struct _vect_peel_extended_info peel_for_known_alignment;
1726 struct _vect_peel_extended_info peel_for_unknown_alignment;
1727 struct _vect_peel_extended_info best_peel;
1729 peel_for_unknown_alignment.inside_cost = INT_MAX;
1730 peel_for_unknown_alignment.outside_cost = INT_MAX;
1731 peel_for_unknown_alignment.peel_info.count = 0;
1733 if (do_peeling
1734 && one_misalignment_unknown)
 1736 /* Check whether the target prefers aligning stores over loads, i.e.,
 1737 whether misaligned stores are more expensive than misaligned loads
 1738 (taking drs with the same alignment into account).  */
1739 unsigned int load_inside_cost = 0;
1740 unsigned int load_outside_cost = 0;
1741 unsigned int store_inside_cost = 0;
1742 unsigned int store_outside_cost = 0;
1744 stmt_vector_for_cost dummy;
1745 dummy.create (2);
1746 vect_get_peeling_costs_all_drs (dr0,
1747 &load_inside_cost,
1748 &load_outside_cost,
1749 &dummy, vf / 2, true);
1750 dummy.release ();
1752 if (first_store)
1754 dummy.create (2);
1755 vect_get_peeling_costs_all_drs (first_store,
1756 &store_inside_cost,
1757 &store_outside_cost,
1758 &dummy, vf / 2, true);
1759 dummy.release ();
1761 else
1763 store_inside_cost = INT_MAX;
1764 store_outside_cost = INT_MAX;
1767 if (load_inside_cost > store_inside_cost
1768 || (load_inside_cost == store_inside_cost
1769 && load_outside_cost > store_outside_cost))
1771 dr0 = first_store;
1772 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1773 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1775 else
1777 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1778 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1781 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1782 prologue_cost_vec.create (2);
1783 epilogue_cost_vec.create (2);
1785 int dummy2;
1786 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1787 (loop_vinfo, vf / 2, &dummy2,
1788 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1789 &prologue_cost_vec, &epilogue_cost_vec);
1791 prologue_cost_vec.release ();
1792 epilogue_cost_vec.release ();
1794 peel_for_unknown_alignment.peel_info.count = 1
1795 + STMT_VINFO_SAME_ALIGN_REFS
1796 (vinfo_for_stmt (DR_STMT (dr0))).length ();
1799 peel_for_unknown_alignment.peel_info.npeel = 0;
1800 peel_for_unknown_alignment.peel_info.dr = dr0;
1802 best_peel = peel_for_unknown_alignment;
1804 peel_for_known_alignment.inside_cost = INT_MAX;
1805 peel_for_known_alignment.outside_cost = INT_MAX;
1806 peel_for_known_alignment.peel_info.count = 0;
1807 peel_for_known_alignment.peel_info.dr = NULL;
1809 if (do_peeling && one_misalignment_known)
1811 /* Peeling is possible, but there is no data access that is not supported
1812 unless aligned. So we try to choose the best possible peeling from
1813 the hash table. */
1814 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1815 (&peeling_htab, loop_vinfo, &npeel, &body_cost_vec);
1818 /* Compare costs of peeling for known and unknown alignment. */
1819 if (peel_for_known_alignment.peel_info.dr != NULL
1820 && peel_for_unknown_alignment.inside_cost
1821 >= peel_for_known_alignment.inside_cost)
1823 best_peel = peel_for_known_alignment;
1825 /* If the best peeling for known alignment has NPEEL == 0, perform no
1826 peeling at all except if there is an unsupportable dr that we can
1827 align. */
1828 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1829 do_peeling = false;
1832 /* If there is an unsupportable data ref, prefer this over all choices so far
1833 since we'd have to discard a chosen peeling except when it accidentally
1834 aligned the unsupportable data ref. */
1835 if (one_dr_unsupportable)
1836 dr0 = unsupportable_dr;
1837 else if (do_peeling)
1839 /* Calculate the penalty for no peeling, i.e. leaving everything
1840 unaligned.
1841 TODO: Adapt vect_get_peeling_costs_all_drs and use here. */
1842 unsigned nopeel_inside_cost = 0;
1843 unsigned nopeel_outside_cost = 0;
1845 stmt_vector_for_cost dummy;
1846 dummy.create (2);
1847 FOR_EACH_VEC_ELT (datarefs, i, dr)
1848 vect_get_data_access_cost (dr, &nopeel_inside_cost,
1849 &nopeel_outside_cost, &dummy);
1850 dummy.release ();
1852 /* Add epilogue costs. As we do not peel for alignment here, no prologue
1853 costs will be recorded. */
1854 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1855 prologue_cost_vec.create (2);
1856 epilogue_cost_vec.create (2);
1858 int dummy2;
1859 nopeel_outside_cost += vect_get_known_peeling_cost
1860 (loop_vinfo, 0, &dummy2,
1861 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1862 &prologue_cost_vec, &epilogue_cost_vec);
1864 prologue_cost_vec.release ();
1865 epilogue_cost_vec.release ();
1867 npeel = best_peel.peel_info.npeel;
1868 dr0 = best_peel.peel_info.dr;
1870 /* If not peeling at all is no more expensive than the best peeling we
1871 have found so far, don't perform any peeling. */
1872 if (nopeel_inside_cost <= best_peel.inside_cost)
1873 do_peeling = false;
1876 if (do_peeling)
1878 stmt = DR_STMT (dr0);
1879 stmt_info = vinfo_for_stmt (stmt);
1880 vectype = STMT_VINFO_VECTYPE (stmt_info);
1881 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1883 if (known_alignment_for_access_p (dr0))
1885 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1886 size_zero_node) < 0;
1887 if (!npeel)
1889 /* Since it's known at compile time, compute the number of
1890 iterations in the peeled loop (the peeling factor) for use in
1891 updating DR_MISALIGNMENT values. The peeling factor is the
1892 vectorization factor minus the misalignment as an element
1893 count. */
1894 mis = DR_MISALIGNMENT (dr0);
1895 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1896 npeel = ((negative ? mis - nelements : nelements - mis)
1897 & (nelements - 1));
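              /* Worked example (assumed values): for V4SI (NELEMENTS == 4,
                 4-byte elements) and DR_MISALIGNMENT (dr0) == 8 bytes,
                 MIS == 2 elements, so with a positive step
                 NPEEL == (4 - 2) & 3 == 2 peeled scalar iterations.  */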
1900 /* For interleaved data access every iteration accesses all the
1901 members of the group, therefore we divide the number of iterations
1902 by the group size. */
1903 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1904 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1905 npeel /= GROUP_SIZE (stmt_info);
1907 if (dump_enabled_p ())
1908 dump_printf_loc (MSG_NOTE, vect_location,
1909 "Try peeling by %d\n", npeel);
1912 /* Ensure that all datarefs can be vectorized after the peel. */
1913 if (!vect_peeling_supportable (loop_vinfo, dr0, npeel))
1914 do_peeling = false;
1916 /* Check if all datarefs are supportable and log. */
1917 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1919 stat = vect_verify_datarefs_alignment (loop_vinfo);
1920 if (!stat)
1921 do_peeling = false;
1922 else
1924 body_cost_vec.release ();
1925 return stat;
1929 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1930 if (do_peeling)
1932 unsigned max_allowed_peel
1933 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1934 if (max_allowed_peel != (unsigned)-1)
1936 unsigned max_peel = npeel;
1937 if (max_peel == 0)
1939 gimple *dr_stmt = DR_STMT (dr0);
1940 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1941 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1942 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1944 if (max_peel > max_allowed_peel)
1946 do_peeling = false;
1947 if (dump_enabled_p ())
1948 dump_printf_loc (MSG_NOTE, vect_location,
1949 "Disable peeling, max peels reached: %d\n", max_peel);
1954 /* Cost model #2 - if peeling may result in a remaining loop not
1955 iterating enough to be vectorized then do not peel. */
1956 if (do_peeling
1957 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1959 unsigned max_peel
1960 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1961 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1962 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1963 do_peeling = false;
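      /* Illustrative example (assumed values): with VF == 4, NPEEL == 2 and
         a known iteration count of 5, we have 5 < 4 + 2, so the remaining
         loop could not be vectorized and peeling is disabled.  */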
1966 if (do_peeling)
1968 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1969 If the misalignment of DR_i is identical to that of dr0 then set
1970 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1971 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1972 by the peeling factor times the element size of DR_i (MOD the
1973 vectorization factor times the size). Otherwise, the
1974 misalignment of DR_i must be set to unknown. */
1975 FOR_EACH_VEC_ELT (datarefs, i, dr)
1976 if (dr != dr0)
1978 /* Strided accesses perform only component accesses, alignment
1979 is irrelevant for them. */
1980 stmt_info = vinfo_for_stmt (DR_STMT (dr));
1981 if (STMT_VINFO_STRIDED_P (stmt_info)
1982 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1983 continue;
1985 vect_update_misalignment_for_peel (dr, dr0, npeel);
1988 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1989 if (npeel)
1990 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1991 else
1992 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1993 = DR_MISALIGNMENT (dr0);
1994 SET_DR_MISALIGNMENT (dr0, 0);
1995 if (dump_enabled_p ())
1997 dump_printf_loc (MSG_NOTE, vect_location,
1998 "Alignment of access forced using peeling.\n");
1999 dump_printf_loc (MSG_NOTE, vect_location,
2000 "Peeling for alignment will be applied.\n");
2002 /* The inside-loop cost will be accounted for in vectorizable_load
2003 and vectorizable_store correctly with adjusted alignments.
2004 Drop the body_cost_vec on the floor here. */
2005 body_cost_vec.release ();
2007 stat = vect_verify_datarefs_alignment (loop_vinfo);
2008 gcc_assert (stat);
2009 return stat;
2013 body_cost_vec.release ();
2015 /* (2) Versioning to force alignment. */
2017 /* Try versioning if:
2018 1) optimize loop for speed
2019 2) there is at least one unsupported misaligned data ref with an unknown
2020 misalignment, and
2021 3) all misaligned data refs with a known misalignment are supported, and
2022 4) the number of runtime alignment checks is within reason. */
2024 do_versioning =
2025 optimize_loop_nest_for_speed_p (loop)
2026 && (!loop->inner); /* FORNOW */
2028 if (do_versioning)
2030 FOR_EACH_VEC_ELT (datarefs, i, dr)
2032 stmt = DR_STMT (dr);
2033 stmt_info = vinfo_for_stmt (stmt);
2035 /* For interleaving, only the alignment of the first access
2036 matters. */
2037 if (aligned_access_p (dr)
2038 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2039 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2040 continue;
2042 if (STMT_VINFO_STRIDED_P (stmt_info))
2044 /* Strided loads perform only component accesses, alignment is
2045 irrelevant for them. */
2046 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2047 continue;
2048 do_versioning = false;
2049 break;
2052 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2054 if (!supportable_dr_alignment)
2056 gimple *stmt;
2057 int mask;
2058 tree vectype;
2060 if (known_alignment_for_access_p (dr)
2061 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2062 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2064 do_versioning = false;
2065 break;
2068 stmt = DR_STMT (dr);
2069 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2070 gcc_assert (vectype);
2072 /* The rightmost bits of an aligned address must be zeros.
2073 Construct the mask needed for this test. For example,
2074 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2075 mask must be 15 = 0xf. */
2076 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2078 /* FORNOW: use the same mask to test all potentially unaligned
2079 references in the loop. The vectorizer currently supports
2080 a single vector size, see the reference to
2081 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2082 vectorization factor is computed. */
2083 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2084 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2085 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2086 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
2087 DR_STMT (dr));
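          /* Conceptual sketch (simplified) of the runtime test that loop
             versioning builds later from LOOP_VINFO_MAY_MISALIGN_STMTS and
             LOOP_VINFO_PTR_MASK, e.g. for V4SI (MASK == 0xf):

               if (((addr_1 | addr_2 | ... | addr_n) & 0xf) == 0)
                 ... vectorized loop ...
               else
                 ... scalar loop ...

             where ADDR_i are the addresses of the potentially misaligned
             data refs recorded above.  */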
2091 /* Versioning requires at least one misaligned data reference. */
2092 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2093 do_versioning = false;
2094 else if (!do_versioning)
2095 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2098 if (do_versioning)
2100 vec<gimple *> may_misalign_stmts
2101 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2102 gimple *stmt;
2104 /* It can now be assumed that the data references in the statements
2105 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2106 of the loop being vectorized. */
2107 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
2109 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2110 dr = STMT_VINFO_DATA_REF (stmt_info);
2111 SET_DR_MISALIGNMENT (dr, 0);
2112 if (dump_enabled_p ())
2113 dump_printf_loc (MSG_NOTE, vect_location,
2114 "Alignment of access forced using versioning.\n");
2117 if (dump_enabled_p ())
2118 dump_printf_loc (MSG_NOTE, vect_location,
2119 "Versioning for alignment will be applied.\n");
2121 /* Peeling and versioning can't be done together at this time. */
2122 gcc_assert (! (do_peeling && do_versioning));
2124 stat = vect_verify_datarefs_alignment (loop_vinfo);
2125 gcc_assert (stat);
2126 return stat;
2129 /* This point is reached if neither peeling nor versioning is being done. */
2130 gcc_assert (! (do_peeling || do_versioning));
2132 stat = vect_verify_datarefs_alignment (loop_vinfo);
2133 return stat;
2137 /* Function vect_find_same_alignment_drs.
2139 Update group and alignment relations according to the chosen
2140 vectorization factor. */
2142 static void
2143 vect_find_same_alignment_drs (struct data_dependence_relation *ddr)
2145 struct data_reference *dra = DDR_A (ddr);
2146 struct data_reference *drb = DDR_B (ddr);
2147 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2148 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2150 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2151 return;
2153 if (dra == drb)
2154 return;
2156 if (!operand_equal_p (DR_BASE_OBJECT (dra), DR_BASE_OBJECT (drb),
2157 OEP_ADDRESS_OF)
2158 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2159 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2160 return;
2162 /* Two references with distance zero have the same alignment. */
2163 offset_int diff = (wi::to_offset (DR_INIT (dra))
2164 - wi::to_offset (DR_INIT (drb)));
2165 if (diff != 0)
2167 /* Get the wider of the two alignments. */
2168 unsigned int align_a = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_a));
2169 unsigned int align_b = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmtinfo_b));
2170 unsigned int max_align = MAX (align_a, align_b);
2172 /* Require the gap to be a multiple of the larger vector alignment. */
2173 if (!wi::multiple_of_p (diff, max_align, SIGNED))
2174 return;
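      /* Illustrative example (assumed types): for two V4SI int refs a[i]
         and a[i+4], DIFF is 16 bytes, a multiple of the 16-byte vector
         alignment, so the pair is recorded below; for a[i] and a[i+2]
         (DIFF == 8 bytes) it is not.  */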
2177 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2178 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2179 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_NOTE, vect_location,
2182 "accesses have the same alignment: ");
2183 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2184 dump_printf (MSG_NOTE, " and ");
2185 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2186 dump_printf (MSG_NOTE, "\n");
2191 /* Function vect_analyze_data_refs_alignment
2193 Analyze the alignment of the data-references in the loop.
2194 Return FALSE if a data reference is found that cannot be vectorized. */
2196 bool
2197 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2199 if (dump_enabled_p ())
2200 dump_printf_loc (MSG_NOTE, vect_location,
2201 "=== vect_analyze_data_refs_alignment ===\n");
2203 /* Mark groups of data references with same alignment using
2204 data dependence information. */
2205 vec<ddr_p> ddrs = vinfo->ddrs;
2206 struct data_dependence_relation *ddr;
2207 unsigned int i;
2209 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2210 vect_find_same_alignment_drs (ddr);
2212 vec<data_reference_p> datarefs = vinfo->datarefs;
2213 struct data_reference *dr;
2215 FOR_EACH_VEC_ELT (datarefs, i, dr)
2217 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
2218 if (STMT_VINFO_VECTORIZABLE (stmt_info)
2219 && !vect_compute_data_ref_alignment (dr))
2221 /* Strided accesses perform only component accesses, misalignment
2222 information is irrelevant for them. */
2223 if (STMT_VINFO_STRIDED_P (stmt_info)
2224 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2225 continue;
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2229 "not vectorized: can't calculate alignment "
2230 "for data ref.\n");
2232 return false;
2236 return true;
2240 /* Analyze alignment of DRs of stmts in NODE. */
2242 static bool
2243 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2245 /* We vectorize from the first scalar stmt in the node unless
2246 the node is permuted in which case we start from the first
2247 element in the group. */
2248 gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
2249 data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2250 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2251 first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
2253 data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
2254 if (! vect_compute_data_ref_alignment (dr)
2255 /* For creating the data-ref pointer we need alignment of the
2256 first element anyway. */
2257 || (dr != first_dr
2258 && ! vect_compute_data_ref_alignment (first_dr))
2259 || ! verify_data_ref_alignment (dr))
2261 if (dump_enabled_p ())
2262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263 "not vectorized: bad data alignment in basic "
2264 "block.\n");
2265 return false;
2268 return true;
2271 /* Function vect_slp_analyze_instance_alignment
2273 Analyze the alignment of the data-references in the SLP instance.
2274 Return FALSE if a data reference is found that cannot be vectorized. */
2276 bool
2277 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2279 if (dump_enabled_p ())
2280 dump_printf_loc (MSG_NOTE, vect_location,
2281 "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
2283 slp_tree node;
2284 unsigned i;
2285 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2286 if (! vect_slp_analyze_and_verify_node_alignment (node))
2287 return false;
2289 node = SLP_INSTANCE_TREE (instance);
2290 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
2291 && ! vect_slp_analyze_and_verify_node_alignment
2292 (SLP_INSTANCE_TREE (instance)))
2293 return false;
2295 return true;
2299 /* Analyze groups of accesses: check that DR belongs to a group of
2300 accesses of legal size, step, etc. Detect gaps, single element
2301 interleaving, and other special cases. Set grouped access info.
2302 Collect groups of strided stores for further use in SLP analysis.
2303 Worker for vect_analyze_group_access. */
2305 static bool
2306 vect_analyze_group_access_1 (struct data_reference *dr)
2308 tree step = DR_STEP (dr);
2309 tree scalar_type = TREE_TYPE (DR_REF (dr));
2310 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2311 gimple *stmt = DR_STMT (dr);
2312 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2313 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2314 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2315 HOST_WIDE_INT dr_step = -1;
2316 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2317 bool slp_impossible = false;
2319 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2320 size of the interleaving group (including gaps). */
2321 if (tree_fits_shwi_p (step))
2323 dr_step = tree_to_shwi (step);
2324 /* Check that STEP is a multiple of type size. Otherwise there is
2325 a non-element-sized gap at the end of the group which we
2326 cannot represent in GROUP_GAP or GROUP_SIZE.
2327 ??? As we can handle non-constant step fine here we should
2328 simply remove uses of GROUP_GAP between the last and first
2329 element and instead rely on DR_STEP. GROUP_SIZE then would
2330 simply not include that gap. */
2331 if ((dr_step % type_size) != 0)
2333 if (dump_enabled_p ())
2335 dump_printf_loc (MSG_NOTE, vect_location,
2336 "Step ");
2337 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2338 dump_printf (MSG_NOTE,
2339 " is not a multiple of the element size for ");
2340 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2341 dump_printf (MSG_NOTE, "\n");
2343 return false;
2345 groupsize = absu_hwi (dr_step) / type_size;
2347 else
2348 groupsize = 0;
2350 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2351 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2353 /* Check if this DR is a part of interleaving, and is a single
2354 element of the group that is accessed in the loop. */
2356 /* Gaps are supported only for loads. STEP must be a multiple of the type
2357 size. The size of the group must be a power of 2. */
2358 if (DR_IS_READ (dr)
2359 && (dr_step % type_size) == 0
2360 && groupsize > 0
2361 && pow2p_hwi (groupsize))
2363 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2364 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2365 GROUP_GAP (stmt_info) = groupsize - 1;
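          /* Illustrative example (assumed types): a load of a[4*i] with
             4-byte ints has DR_STEP == 16, so GROUPSIZE == 4 and the
             single-element group gets GROUP_GAP == 3.  */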
2366 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "Detected single element interleaving ");
2370 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2371 dump_printf (MSG_NOTE, " step ");
2372 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2373 dump_printf (MSG_NOTE, "\n");
2376 return true;
2379 if (dump_enabled_p ())
2381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2382 "not consecutive access ");
2383 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2386 if (bb_vinfo)
2388 /* Mark the statement as unvectorizable. */
2389 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2390 return true;
2393 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2394 STMT_VINFO_STRIDED_P (stmt_info) = true;
2395 return true;
2398 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2400 /* First stmt in the interleaving chain. Check the chain. */
2401 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2402 struct data_reference *data_ref = dr;
2403 unsigned int count = 1;
2404 tree prev_init = DR_INIT (data_ref);
2405 gimple *prev = stmt;
2406 HOST_WIDE_INT diff, gaps = 0;
2408 while (next)
2410 /* Skip same data-refs. In case two or more stmts share the same
2411 data-ref (supported only for loads), we vectorize only the first
2412 stmt, and the rest get their vectorized loads from the first
2413 one. */
2414 if (!tree_int_cst_compare (DR_INIT (data_ref),
2415 DR_INIT (STMT_VINFO_DATA_REF (
2416 vinfo_for_stmt (next)))))
2418 if (DR_IS_WRITE (data_ref))
2420 if (dump_enabled_p ())
2421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2422 "Two store stmts share the same dr.\n");
2423 return false;
2426 if (dump_enabled_p ())
2427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2428 "Two or more load stmts share the same dr.\n");
2430 /* For load use the same data-ref load. */
2431 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2433 prev = next;
2434 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2435 continue;
2438 prev = next;
2439 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2441 /* All group members have the same STEP by construction. */
2442 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2444 /* Check that the distance between two accesses is equal to the type
2445 size. Otherwise, we have gaps. */
2446 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2447 - TREE_INT_CST_LOW (prev_init)) / type_size;
2448 if (diff != 1)
2450 /* FORNOW: SLP of accesses with gaps is not supported. */
2451 slp_impossible = true;
2452 if (DR_IS_WRITE (data_ref))
2454 if (dump_enabled_p ())
2455 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2456 "interleaved store with gaps\n");
2457 return false;
2460 gaps += diff - 1;
2463 last_accessed_element += diff;
2465 /* Store the gap from the previous member of the group. If there is no
2466 gap in the access, GROUP_GAP is always 1. */
2467 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2469 prev_init = DR_INIT (data_ref);
2470 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2471 /* Count the number of data-refs in the chain. */
2472 count++;
2475 if (groupsize == 0)
2476 groupsize = count + gaps;
2478 /* This could be UINT_MAX but as we are generating code in a very
2479 inefficient way we have to cap earlier. See PR78699 for example. */
2480 if (groupsize > 4096)
2482 if (dump_enabled_p ())
2483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2484 "group is too large\n");
2485 return false;
2488 /* Check that the size of the interleaving is equal to count for stores,
2489 i.e., that there are no gaps. */
2490 if (groupsize != count
2491 && !DR_IS_READ (dr))
2493 if (dump_enabled_p ())
2494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2495 "interleaved store with gaps\n");
2496 return false;
2499 /* If there is a gap after the last load in the group it is the
2500 difference between the groupsize and the last accessed
2501 element.
2502 When there is no gap, this difference should be 0. */
2503 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2505 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
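      /* Illustrative example (assumed types): for loads of a[3*i] and
         a[3*i+1] with 4-byte ints, DR_STEP == 12 gives GROUPSIZE == 3,
         COUNT == 2 and LAST_ACCESSED_ELEMENT == 2, so the gap after the
         group is 3 - 2 == 1 element.  */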
2506 if (dump_enabled_p ())
2508 dump_printf_loc (MSG_NOTE, vect_location,
2509 "Detected interleaving ");
2510 if (DR_IS_READ (dr))
2511 dump_printf (MSG_NOTE, "load ");
2512 else
2513 dump_printf (MSG_NOTE, "store ");
2514 dump_printf (MSG_NOTE, "of size %u starting with ",
2515 (unsigned)groupsize);
2516 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2517 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2518 dump_printf_loc (MSG_NOTE, vect_location,
2519 "There is a gap of %u elements after the group\n",
2520 GROUP_GAP (vinfo_for_stmt (stmt)));
2523 /* SLP: create an SLP data structure for every interleaving group of
2524 stores for further analysis in vect_analyze_slp. */
2525 if (DR_IS_WRITE (dr) && !slp_impossible)
2527 if (loop_vinfo)
2528 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2529 if (bb_vinfo)
2530 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2534 return true;
2537 /* Analyze groups of accesses: check that DR belongs to a group of
2538 accesses of legal size, step, etc. Detect gaps, single element
2539 interleaving, and other special cases. Set grouped access info.
2540 Collect groups of strided stores for further use in SLP analysis. */
2542 static bool
2543 vect_analyze_group_access (struct data_reference *dr)
2545 if (!vect_analyze_group_access_1 (dr))
2547 /* Dissolve the group if present. */
2548 gimple *next;
2549 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2550 while (stmt)
2552 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2553 next = GROUP_NEXT_ELEMENT (vinfo);
2554 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2555 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2556 stmt = next;
2558 return false;
2560 return true;
2563 /* Analyze the access pattern of the data-reference DR.
2564 In case of non-consecutive accesses call vect_analyze_group_access() to
2565 analyze groups of accesses. */
2567 static bool
2568 vect_analyze_data_ref_access (struct data_reference *dr)
2570 tree step = DR_STEP (dr);
2571 tree scalar_type = TREE_TYPE (DR_REF (dr));
2572 gimple *stmt = DR_STMT (dr);
2573 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2574 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2575 struct loop *loop = NULL;
2577 if (loop_vinfo)
2578 loop = LOOP_VINFO_LOOP (loop_vinfo);
2580 if (loop_vinfo && !step)
2582 if (dump_enabled_p ())
2583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2584 "bad data-ref access in loop\n");
2585 return false;
2588 /* Allow loads with zero step in inner-loop vectorization. */
2589 if (loop_vinfo && integer_zerop (step))
2591 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2592 if (!nested_in_vect_loop_p (loop, stmt))
2593 return DR_IS_READ (dr);
2594 /* Allow references with zero step only for outer loops marked
2595 with pragma omp simd - it guarantees absence of
2596 loop-carried dependencies between inner loop iterations. */
2597 if (!loop->force_vectorize)
2599 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_NOTE, vect_location,
2601 "zero step in inner loop of nest\n");
2602 return false;
2606 if (loop && nested_in_vect_loop_p (loop, stmt))
2608 /* Interleaved accesses are not yet supported within outer-loop
2609 vectorization for references in the inner-loop. */
2610 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2612 /* For the rest of the analysis we use the outer-loop step. */
2613 step = STMT_VINFO_DR_STEP (stmt_info);
2614 if (integer_zerop (step))
2616 if (dump_enabled_p ())
2617 dump_printf_loc (MSG_NOTE, vect_location,
2618 "zero step in outer loop.\n");
2619 return DR_IS_READ (dr);
2623 /* Consecutive? */
2624 if (TREE_CODE (step) == INTEGER_CST)
2626 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2627 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2628 || (dr_step < 0
2629 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2631 /* Mark that it is not interleaving. */
2632 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2633 return true;
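      /* Illustrative example (assumed types): a 4-byte int access with
         DR_STEP == 4 (== TYPE_SIZE_UNIT) is consecutive, as is the reverse
         access with DR_STEP == -4; both take the early return above.  */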
2637 if (loop && nested_in_vect_loop_p (loop, stmt))
2639 if (dump_enabled_p ())
2640 dump_printf_loc (MSG_NOTE, vect_location,
2641 "grouped access in outer loop.\n");
2642 return false;
2646 /* Assume this is a DR handled by the non-constant strided load case. */
2647 if (TREE_CODE (step) != INTEGER_CST)
2648 return (STMT_VINFO_STRIDED_P (stmt_info)
2649 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2650 || vect_analyze_group_access (dr)));
2652 /* Not a consecutive access - check if it's part of an interleaving group. */
2653 return vect_analyze_group_access (dr);
2656 /* Compare two data-references DRA and DRB so that they can be sorted
2657 into chunks suitable for grouping. */
2659 static int
2660 dr_group_sort_cmp (const void *dra_, const void *drb_)
2662 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2663 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2664 int cmp;
2666 /* Stabilize sort. */
2667 if (dra == drb)
2668 return 0;
2670 /* DRs in different loops never belong to the same group. */
2671 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2672 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2673 if (loopa != loopb)
2674 return loopa->num < loopb->num ? -1 : 1;
2676 /* Ordering of DRs according to base. */
2677 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2679 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2680 DR_BASE_ADDRESS (drb));
2681 if (cmp != 0)
2682 return cmp;
2685 /* And according to DR_OFFSET. */
2686 if (!dr_equal_offsets_p (dra, drb))
2688 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2689 if (cmp != 0)
2690 return cmp;
2693 /* Put reads before writes. */
2694 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2695 return DR_IS_READ (dra) ? -1 : 1;
2697 /* Then sort by access size. */
2698 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2699 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2701 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2702 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2703 if (cmp != 0)
2704 return cmp;
2707 /* And by step. */
2708 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2710 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2711 if (cmp != 0)
2712 return cmp;
2715 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2716 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2717 if (cmp == 0)
2718 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2719 return cmp;
2722 /* Function vect_analyze_data_ref_accesses.
2724 Analyze the access pattern of all the data references in the loop.
2726 FORNOW: the only access pattern that is considered vectorizable is a
2727 simple step 1 (consecutive) access.
2729 FORNOW: handle only arrays and pointer accesses. */
2731 bool
2732 vect_analyze_data_ref_accesses (vec_info *vinfo)
2734 unsigned int i;
2735 vec<data_reference_p> datarefs = vinfo->datarefs;
2736 struct data_reference *dr;
2738 if (dump_enabled_p ())
2739 dump_printf_loc (MSG_NOTE, vect_location,
2740 "=== vect_analyze_data_ref_accesses ===\n");
2742 if (datarefs.is_empty ())
2743 return true;
2745 /* Sort the array of datarefs to make building the interleaving chains
2746 linear. Don't modify the original vector's order, it is needed for
2747 determining what dependencies are reversed. */
2748 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2749 datarefs_copy.qsort (dr_group_sort_cmp);
2751 /* Build the interleaving chains. */
2752 for (i = 0; i < datarefs_copy.length () - 1;)
2754 data_reference_p dra = datarefs_copy[i];
2755 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2756 stmt_vec_info lastinfo = NULL;
2757 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_a))
2759 ++i;
2760 continue;
2762 for (i = i + 1; i < datarefs_copy.length (); ++i)
2764 data_reference_p drb = datarefs_copy[i];
2765 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2766 if (! STMT_VINFO_VECTORIZABLE (stmtinfo_b))
2767 break;
2769 /* ??? Imperfect sorting (non-compatible types, non-modulo
2770 accesses, same accesses) can lead to a group being artificially
2771 split here as we don't just skip over those. If it really
2772 matters we can push those to a worklist and re-iterate
2773 over them. Then we can just skip ahead to the next DR here. */
2775 /* DRs in a different loop should not be put into the same
2776 interleaving group. */
2777 if (gimple_bb (DR_STMT (dra))->loop_father
2778 != gimple_bb (DR_STMT (drb))->loop_father)
2779 break;
2781 /* Check that the data-refs have same first location (except init)
2782 and they are both either store or load (not load and store,
2783 not masked loads or stores). */
2784 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2785 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2786 DR_BASE_ADDRESS (drb), 0)
2787 || !dr_equal_offsets_p (dra, drb)
2788 || !gimple_assign_single_p (DR_STMT (dra))
2789 || !gimple_assign_single_p (DR_STMT (drb)))
2790 break;
2792 /* Check that the data-refs have the same constant size. */
2793 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2794 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2795 if (!tree_fits_uhwi_p (sza)
2796 || !tree_fits_uhwi_p (szb)
2797 || !tree_int_cst_equal (sza, szb))
2798 break;
2800 /* Check that the data-refs have the same step. */
2801 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2802 break;
2804 /* Do not place the same access in the interleaving chain twice. */
2805 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2806 break;
2808 /* Check the types are compatible.
2809 ??? We don't distinguish this during sorting. */
2810 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2811 TREE_TYPE (DR_REF (drb))))
2812 break;
2814 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2815 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2816 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2817 gcc_assert (init_a <= init_b);
2819 /* If init_b == init_a + the size of the type * k, we have an
2820 interleaving, and DRA is accessed before DRB. */
2821 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2822 if (type_size_a == 0
2823 || (init_b - init_a) % type_size_a != 0)
2824 break;
2826 /* If we have a store, the accesses are adjacent. This splits
2827 groups into chunks we support (we don't support vectorization
2828 of stores with gaps). */
2829 if (!DR_IS_READ (dra)
2830 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2831 (DR_INIT (datarefs_copy[i-1]))
2832 != type_size_a))
2833 break;
2835 /* If the step (if not zero or non-constant) is greater than the
2836 difference between data-refs' inits this splits groups into
2837 suitable sizes. */
2838 if (tree_fits_shwi_p (DR_STEP (dra)))
2840 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2841 if (step != 0 && step <= (init_b - init_a))
2842 break;
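          /* Illustrative example (assumed types): DRA == a[2*i] (DR_INIT 0)
             and DRB == a[2*i+1] (DR_INIT 4) with 4-byte ints and
             DR_STEP == 8: the init difference of 4 is a multiple of the
             type size and smaller than the step, so the two refs are
             linked into one interleaving chain below.  */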
2845 if (dump_enabled_p ())
2847 dump_printf_loc (MSG_NOTE, vect_location,
2848 "Detected interleaving ");
2849 if (DR_IS_READ (dra))
2850 dump_printf (MSG_NOTE, "load ");
2851 else
2852 dump_printf (MSG_NOTE, "store ");
2853 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2854 dump_printf (MSG_NOTE, " and ");
2855 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2856 dump_printf (MSG_NOTE, "\n");
2859 /* Link the found element into the group list. */
2860 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2862 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2863 lastinfo = stmtinfo_a;
2865 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2866 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2867 lastinfo = stmtinfo_b;
2871 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2872 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2873 && !vect_analyze_data_ref_access (dr))
2875 if (dump_enabled_p ())
2876 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2877 "not vectorized: complicated access pattern.\n");
2879 if (is_a <bb_vec_info> (vinfo))
2881 /* Mark the statement as not vectorizable. */
2882 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2883 continue;
2885 else
2887 datarefs_copy.release ();
2888 return false;
2892 datarefs_copy.release ();
2893 return true;
2896 /* Function vect_vfa_segment_size.
2898 Create an expression that computes the size of the segment
2899 that will be accessed for a data reference. The function takes into
2900 account that realignment loads may access one more vector.
2902 Input:
2903 DR: The data reference.
2904 LENGTH_FACTOR: segment length to consider.
2906 Return an expression whose value is the size of segment which will be
2907 accessed by DR. */
2909 static tree
2910 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2912 tree segment_length;
2914 if (integer_zerop (DR_STEP (dr)))
2915 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2916 else
2917 segment_length = size_binop (MULT_EXPR,
2918 fold_convert (sizetype, DR_STEP (dr)),
2919 fold_convert (sizetype, length_factor));
2921 if (vect_supportable_dr_alignment (dr, false)
2922 == dr_explicit_realign_optimized)
2924 tree vector_size = TYPE_SIZE_UNIT
2925 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2927 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
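    /* Illustrative example (assumed types): for a 4-byte int ref with
       DR_STEP == 4 and LENGTH_FACTOR == 4 (the VF), the segment is
       16 bytes; with dr_explicit_realign_optimized and a 16-byte vector
       it grows to 32 bytes.  */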
2929 return segment_length;
2932 /* Function vect_no_alias_p.
2934 Given data references A and B with equal base and offset, the alias
2935 relation can be decided at compilation time, return TRUE if they do
2936 not alias each other; return FALSE otherwise. SEGMENT_LENGTH_A
2937 and SEGMENT_LENGTH_B are the memory lengths accessed by A and B
2938 respectively. */
2940 static bool
2941 vect_no_alias_p (struct data_reference *a, struct data_reference *b,
2942 tree segment_length_a, tree segment_length_b)
2944 gcc_assert (TREE_CODE (DR_INIT (a)) == INTEGER_CST
2945 && TREE_CODE (DR_INIT (b)) == INTEGER_CST);
2946 if (tree_int_cst_equal (DR_INIT (a), DR_INIT (b)))
2947 return false;
2949 tree seg_a_min = DR_INIT (a);
2950 tree seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_min),
2951 seg_a_min, segment_length_a);
2952 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
2953 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
2954 [a, a+12) */
2955 if (tree_int_cst_compare (DR_STEP (a), size_zero_node) < 0)
2957 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (a)));
2958 seg_a_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_a_max),
2959 seg_a_max, unit_size);
2960 seg_a_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (a)),
2961 DR_INIT (a), unit_size);
2963 tree seg_b_min = DR_INIT (b);
2964 tree seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_min),
2965 seg_b_min, segment_length_b);
2966 if (tree_int_cst_compare (DR_STEP (b), size_zero_node) < 0)
2968 tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (b)));
2969 seg_b_min = fold_build2 (PLUS_EXPR, TREE_TYPE (seg_b_max),
2970 seg_b_max, unit_size);
2971 seg_b_max = fold_build2 (PLUS_EXPR, TREE_TYPE (DR_INIT (b)),
2972 DR_INIT (b), unit_size);
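  /* Illustrative example (assumed values): with positive steps,
     DR_INIT (a) == 0, DR_INIT (b) == 16 and both segment lengths 16,
     A covers [0, 16) and B covers [16, 32), so SEG_A_MAX <= SEG_B_MIN
     and the refs provably do not alias.  */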
2975 if (tree_int_cst_le (seg_a_max, seg_b_min)
2976 || tree_int_cst_le (seg_b_max, seg_a_min))
2977 return true;
2979 return false;
2982 /* Function vect_prune_runtime_alias_test_list.
2984 Prune a list of ddrs to be tested at run-time by versioning for alias.
2985 Merge several alias checks into one if possible.
2986 Return FALSE if the resulting list of ddrs is longer than allowed by
2987 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2989 bool
2990 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2992 vec<ddr_p> may_alias_ddrs =
2993 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2994 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2995 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2996 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2997 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2999 ddr_p ddr;
3000 unsigned int i;
3001 tree length_factor;
3003 if (dump_enabled_p ())
3004 dump_printf_loc (MSG_NOTE, vect_location,
3005 "=== vect_prune_runtime_alias_test_list ===\n");
3007 if (may_alias_ddrs.is_empty ())
3008 return true;
3010 comp_alias_ddrs.create (may_alias_ddrs.length ());
3012 /* First, we collect all data ref pairs for aliasing checks. */
3013 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3015 int comp_res;
3016 struct data_reference *dr_a, *dr_b;
3017 gimple *dr_group_first_a, *dr_group_first_b;
3018 tree segment_length_a, segment_length_b;
3019 gimple *stmt_a, *stmt_b;
3021 dr_a = DDR_A (ddr);
3022 stmt_a = DR_STMT (DDR_A (ddr));
3023 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
3024 if (dr_group_first_a)
3026 stmt_a = dr_group_first_a;
3027 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
3030 dr_b = DDR_B (ddr);
3031 stmt_b = DR_STMT (DDR_B (ddr));
3032 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
3033 if (dr_group_first_b)
3035 stmt_b = dr_group_first_b;
3036 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
3039 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
3040 length_factor = scalar_loop_iters;
3041 else
3042 length_factor = size_int (vect_factor);
3043 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
3044 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
3046 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
3047 DR_BASE_ADDRESS (dr_b));
3048 if (comp_res == 0)
3049 comp_res = data_ref_compare_tree (DR_OFFSET (dr_a),
3050 DR_OFFSET (dr_b));
3052 /* Alias is known at compilation time. */
3053 if (comp_res == 0
3054 && TREE_CODE (DR_STEP (dr_a)) == INTEGER_CST
3055 && TREE_CODE (DR_STEP (dr_b)) == INTEGER_CST
3056 && TREE_CODE (segment_length_a) == INTEGER_CST
3057 && TREE_CODE (segment_length_b) == INTEGER_CST)
3059 if (vect_no_alias_p (dr_a, dr_b, segment_length_a, segment_length_b))
3060 continue;
3062 if (dump_enabled_p ())
3063 dump_printf_loc (MSG_NOTE, vect_location,
3064 "not vectorized: compilation time alias.\n");
3066 return false;
3069 dr_with_seg_len_pair_t dr_with_seg_len_pair
3070 (dr_with_seg_len (dr_a, segment_length_a),
3071 dr_with_seg_len (dr_b, segment_length_b));
3073 /* Canonicalize pairs by sorting the two DR members. */
3074 if (comp_res > 0)
3075 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3077 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3080 prune_runtime_alias_test_list (&comp_alias_ddrs,
3081 (unsigned HOST_WIDE_INT) vect_factor);
3082 dump_printf_loc (MSG_NOTE, vect_location,
3083 "improved number of alias checks from %d to %d\n",
3084 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3085 if ((int) comp_alias_ddrs.length () >
3086 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3088 if (dump_enabled_p ())
3089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3090 "number of versioning for alias "
3091 "run-time tests exceeds %d "
3092 "(--param vect-max-version-for-alias-checks)\n",
3093 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3094 return false;
3097 /* All alias checks have been resolved at compilation time. */
3098 if (!comp_alias_ddrs.length ())
3099 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
3101 return true;
3104 /* Return true if a non-affine read or write in STMT is suitable for a
3105 gather load or scatter store. Describe the operation in *INFO if so. */
3107 bool
3108 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo,
3109 gather_scatter_info *info)
3111 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3112 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3113 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3114 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3115 tree offtype = NULL_TREE;
3116 tree decl, base, off;
3117 machine_mode pmode;
3118 int punsignedp, reversep, pvolatilep = 0;
3120 base = DR_REF (dr);
3121 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3122 see if we can use the def stmt of the address. */
3123 if (is_gimple_call (stmt)
3124 && gimple_call_internal_p (stmt)
3125 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3126 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3127 && TREE_CODE (base) == MEM_REF
3128 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3129 && integer_zerop (TREE_OPERAND (base, 1))
3130 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3132 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3133 if (is_gimple_assign (def_stmt)
3134 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3135 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3138 /* The gather and scatter builtins need address of the form
3139 loop_invariant + vector * {1, 2, 4, 8}
3140 or
3141 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3142 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3143 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3144 multiplications and additions in it. To get a vector, we need
3145 a single SSA_NAME that will be defined in the loop and will
3146 contain everything that is not loop invariant and that can be
3147 vectorized. The following code attempts to find such a preexisting
3148 SSA_NAME OFF and put the loop invariants into a tree BASE
3149 that can be gimplified before the loop. */
3150 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3151 &punsignedp, &reversep, &pvolatilep);
3152 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3154 if (TREE_CODE (base) == MEM_REF)
3156 if (!integer_zerop (TREE_OPERAND (base, 1)))
3158 if (off == NULL_TREE)
3160 offset_int moff = mem_ref_offset (base);
3161 off = wide_int_to_tree (sizetype, moff);
3163 else
3164 off = size_binop (PLUS_EXPR, off,
3165 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3167 base = TREE_OPERAND (base, 0);
3169 else
3170 base = build_fold_addr_expr (base);
3172 if (off == NULL_TREE)
3173 off = size_zero_node;
3175 /* If base is not loop invariant, then if off is 0 we start with just
3176 the constant offset in the loop invariant BASE and continue with base
3177 as OFF; otherwise give up.
3178 We could handle that case by gimplifying the addition of base + off
3179 into some SSA_NAME and use that as off, but for now punt. */
3180 if (!expr_invariant_in_loop_p (loop, base))
3182 if (!integer_zerop (off))
3183 return false;
3184 off = base;
3185 base = size_int (pbitpos / BITS_PER_UNIT);
3187 /* Otherwise put base + constant offset into the loop invariant BASE
3188 and continue with OFF. */
3189 else
3191 base = fold_convert (sizetype, base);
3192 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3195 /* OFF at this point may be either a SSA_NAME or some tree expression
3196 from get_inner_reference. Try to peel off loop invariants from it
3197 into BASE as long as possible. */
3198 STRIP_NOPS (off);
3199 while (offtype == NULL_TREE)
3201 enum tree_code code;
3202 tree op0, op1, add = NULL_TREE;
3204 if (TREE_CODE (off) == SSA_NAME)
3206 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3208 if (expr_invariant_in_loop_p (loop, off))
3209 return false;
3211 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3212 break;
3214 op0 = gimple_assign_rhs1 (def_stmt);
3215 code = gimple_assign_rhs_code (def_stmt);
3216 op1 = gimple_assign_rhs2 (def_stmt);
3218 else
3220 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3221 return false;
3222 code = TREE_CODE (off);
3223 extract_ops_from_tree (off, &code, &op0, &op1);
3225 switch (code)
3227 case POINTER_PLUS_EXPR:
3228 case PLUS_EXPR:
3229 if (expr_invariant_in_loop_p (loop, op0))
3231 add = op0;
3232 off = op1;
3233 do_add:
3234 add = fold_convert (sizetype, add);
3235 if (scale != 1)
3236 add = size_binop (MULT_EXPR, add, size_int (scale));
3237 base = size_binop (PLUS_EXPR, base, add);
3238 continue;
3240 if (expr_invariant_in_loop_p (loop, op1))
3242 add = op1;
3243 off = op0;
3244 goto do_add;
3246 break;
3247 case MINUS_EXPR:
3248 if (expr_invariant_in_loop_p (loop, op1))
3250 add = fold_convert (sizetype, op1);
3251 add = size_binop (MINUS_EXPR, size_zero_node, add);
3252 off = op0;
3253 goto do_add;
3255 break;
3256 case MULT_EXPR:
3257 if (scale == 1 && tree_fits_shwi_p (op1))
3259 scale = tree_to_shwi (op1);
3260 off = op0;
3261 continue;
3263 break;
3264 case SSA_NAME:
3265 off = op0;
3266 continue;
3267 CASE_CONVERT:
3268 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3269 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3270 break;
3271 if (TYPE_PRECISION (TREE_TYPE (op0))
3272 == TYPE_PRECISION (TREE_TYPE (off)))
3274 off = op0;
3275 continue;
3277 if (TYPE_PRECISION (TREE_TYPE (op0))
3278 < TYPE_PRECISION (TREE_TYPE (off)))
3280 off = op0;
3281 offtype = TREE_TYPE (off);
3282 STRIP_NOPS (off);
3283 continue;
3285 break;
3286 default:
3287 break;
3289 break;
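  /* Illustrative walk-through (hypothetical expression): if OFF starts as
     (j + 3) * 4 with loop-invariant 3, the MULT_EXPR case records
     SCALE == 4 and leaves OFF == j + 3, the PLUS_EXPR case then folds
     3 * 4 into BASE and leaves OFF == j, and a widening conversion of j
     would finally set OFFTYPE to the narrower type.  */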
3292 /* If at the end OFF still isn't a SSA_NAME or isn't
3293 defined in the loop, punt. */
3294 if (TREE_CODE (off) != SSA_NAME
3295 || expr_invariant_in_loop_p (loop, off))
3296 return false;
3298 if (offtype == NULL_TREE)
3299 offtype = TREE_TYPE (off);
3301 if (DR_IS_READ (dr))
3302 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3303 offtype, scale);
3304 else
3305 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3306 offtype, scale);
3308 if (decl == NULL_TREE)
3309 return false;
3311 info->decl = decl;
3312 info->base = base;
3313 info->offset = off;
3314 info->offset_dt = vect_unknown_def_type;
3315 info->offset_vectype = NULL_TREE;
3316 info->scale = scale;
3317 return true;
3320 /* Function vect_analyze_data_refs.
3322 Find all the data references in the loop or basic block.
3324 The general structure of the analysis of data refs in the vectorizer is as
3325 follows:
3326 1- vect_analyze_data_refs(loop/bb): call
3327 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3328 in the loop/bb and their dependences.
3329 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3330 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3331 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3335 bool
3336 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3338 struct loop *loop = NULL;
3339 unsigned int i;
3340 struct data_reference *dr;
3341 tree scalar_type;
3343 if (dump_enabled_p ())
3344 dump_printf_loc (MSG_NOTE, vect_location,
3345 "=== vect_analyze_data_refs ===\n");
3347 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3348 loop = LOOP_VINFO_LOOP (loop_vinfo);
3350 /* Go through the data-refs, check that the analysis succeeded. Update
3351 pointer from stmt_vec_info struct to DR and vectype. */
3353 vec<data_reference_p> datarefs = vinfo->datarefs;
3354 FOR_EACH_VEC_ELT (datarefs, i, dr)
3356 gimple *stmt;
3357 stmt_vec_info stmt_info;
3358 tree base, offset, init;
3359 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3360 bool simd_lane_access = false;
3361 int vf;
3363 again:
3364 if (!dr || !DR_REF (dr))
3366 if (dump_enabled_p ())
3367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3368 "not vectorized: unhandled data-ref\n");
3369 return false;
3372 stmt = DR_STMT (dr);
3373 stmt_info = vinfo_for_stmt (stmt);
3375 /* Discard clobbers from the dataref vector. We will remove
3376 clobber stmts during vectorization. */
3377 if (gimple_clobber_p (stmt))
3379 free_data_ref (dr);
3380 if (i == datarefs.length () - 1)
3382 datarefs.pop ();
3383 break;
3385 datarefs.ordered_remove (i);
3386 dr = datarefs[i];
3387 goto again;
3390 /* Check that analysis of the data-ref succeeded. */
3391 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3392 || !DR_STEP (dr))
3394 bool maybe_gather
3395 = DR_IS_READ (dr)
3396 && !TREE_THIS_VOLATILE (DR_REF (dr))
3397 && targetm.vectorize.builtin_gather != NULL;
3398 bool maybe_scatter
3399 = DR_IS_WRITE (dr)
3400 && !TREE_THIS_VOLATILE (DR_REF (dr))
3401 && targetm.vectorize.builtin_scatter != NULL;
3402 bool maybe_simd_lane_access
3403 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3405 /* If the target supports vector gather loads or scatter stores, or if
3406 this might be a SIMD lane access, check whether they can be used. */
3407 if (is_a <loop_vec_info> (vinfo)
3408 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3409 && !nested_in_vect_loop_p (loop, stmt))
3411 struct data_reference *newdr
3412 = create_data_ref (NULL, loop_containing_stmt (stmt),
3413 DR_REF (dr), stmt, maybe_scatter ? false : true);
3414 gcc_assert (newdr != NULL && DR_REF (newdr));
3415 if (DR_BASE_ADDRESS (newdr)
3416 && DR_OFFSET (newdr)
3417 && DR_INIT (newdr)
3418 && DR_STEP (newdr)
3419 && integer_zerop (DR_STEP (newdr)))
3421 if (maybe_simd_lane_access)
3423 tree off = DR_OFFSET (newdr);
3424 STRIP_NOPS (off);
3425 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3426 && TREE_CODE (off) == MULT_EXPR
3427 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3429 tree step = TREE_OPERAND (off, 1);
3430 off = TREE_OPERAND (off, 0);
3431 STRIP_NOPS (off);
3432 if (CONVERT_EXPR_P (off)
3433 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3434 0)))
3435 < TYPE_PRECISION (TREE_TYPE (off)))
3436 off = TREE_OPERAND (off, 0);
3437 if (TREE_CODE (off) == SSA_NAME)
3439 gimple *def = SSA_NAME_DEF_STMT (off);
3440 tree reft = TREE_TYPE (DR_REF (newdr));
3441 if (is_gimple_call (def)
3442 && gimple_call_internal_p (def)
3443 && (gimple_call_internal_fn (def)
3444 == IFN_GOMP_SIMD_LANE))
3446 tree arg = gimple_call_arg (def, 0);
3447 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3448 arg = SSA_NAME_VAR (arg);
3449 if (arg == loop->simduid
3450 /* For now. */
3451 && tree_int_cst_equal
3452 (TYPE_SIZE_UNIT (reft),
3453 step))
3455 DR_OFFSET (newdr) = ssize_int (0);
3456 DR_STEP (newdr) = step;
3457 DR_ALIGNED_TO (newdr)
3458 = size_int (BIGGEST_ALIGNMENT);
3459 dr = newdr;
3460 simd_lane_access = true;
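              /* Illustrative example (assumed source): for a lane-private
                 int array indexed by the .GOMP_SIMD_LANE (simduid) value,
                 OFF is that SSA_NAME multiplied by 4 and STEP == 4
                 == TYPE_SIZE_UNIT of the ref, so the DR is rewritten above
                 with a zero offset and that step.  */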
3466 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3468 dr = newdr;
3469 if (maybe_gather)
3470 gatherscatter = GATHER;
3471 else
3472 gatherscatter = SCATTER;
3475 if (gatherscatter == SG_NONE && !simd_lane_access)
3476 free_data_ref (newdr);
3479 if (gatherscatter == SG_NONE && !simd_lane_access)
3481 if (dump_enabled_p ())
3483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3484 "not vectorized: data ref analysis "
3485 "failed ");
3486 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3489 if (is_a <bb_vec_info> (vinfo))
3490 break;
3492 return false;
3496 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3498 if (dump_enabled_p ())
3499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3500 "not vectorized: base addr of dr is a "
3501 "constant\n");
3503 if (is_a <bb_vec_info> (vinfo))
3504 break;
3506 if (gatherscatter != SG_NONE || simd_lane_access)
3507 free_data_ref (dr);
3508 return false;
3511 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3513 if (dump_enabled_p ())
3515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3516 "not vectorized: volatile type ");
3517 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3520 if (is_a <bb_vec_info> (vinfo))
3521 break;
3523 return false;
3526 if (stmt_can_throw_internal (stmt))
3528 if (dump_enabled_p ())
3530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3531 "not vectorized: statement can throw an "
3532 "exception ");
3533 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3536 if (is_a <bb_vec_info> (vinfo))
3537 break;
3539 if (gatherscatter != SG_NONE || simd_lane_access)
3540 free_data_ref (dr);
3541 return false;
3544 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3545 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3547 if (dump_enabled_p ())
3549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3550 "not vectorized: statement is bitfield "
3551 "access ");
3552 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3555 if (is_a <bb_vec_info> (vinfo))
3556 break;
3558 if (gatherscatter != SG_NONE || simd_lane_access)
3559 free_data_ref (dr);
3560 return false;
3563 base = unshare_expr (DR_BASE_ADDRESS (dr));
3564 offset = unshare_expr (DR_OFFSET (dr));
3565 init = unshare_expr (DR_INIT (dr));
3567 if (is_gimple_call (stmt)
3568 && (!gimple_call_internal_p (stmt)
3569 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3570 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3572 if (dump_enabled_p ())
3574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3575 "not vectorized: dr in a call ");
3576 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3579 if (is_a <bb_vec_info> (vinfo))
3580 break;
3582 if (gatherscatter != SG_NONE || simd_lane_access)
3583 free_data_ref (dr);
3584 return false;
3587 /* Update DR field in stmt_vec_info struct. */
3589 /* If the dataref is in an inner-loop of the loop that is considered
3590 for vectorization, we also want to analyze the access relative to
3591 the outer-loop (DR contains information only relative to the
3592 inner-most enclosing loop). We do that by building a reference to the
3593 first location accessed by the inner-loop, and analyze it relative to
3594 the outer-loop. */
3595 if (loop && nested_in_vect_loop_p (loop, stmt))
3597 tree outer_step, outer_base, outer_init;
3598 HOST_WIDE_INT pbitsize, pbitpos;
3599 tree poffset;
3600 machine_mode pmode;
3601 int punsignedp, preversep, pvolatilep;
3602 affine_iv base_iv, offset_iv;
3603 tree dinit;
3605 /* Build a reference to the first location accessed by the
3606 inner-loop: *(BASE+INIT). (The first location is actually
3607 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3608 tree inner_base = build_fold_indirect_ref
3609 (fold_build_pointer_plus (base, init));
3611 if (dump_enabled_p ())
3613 dump_printf_loc (MSG_NOTE, vect_location,
3614 "analyze in outer-loop: ");
3615 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3616 dump_printf (MSG_NOTE, "\n");
3619 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3620 &poffset, &pmode, &punsignedp,
3621 &preversep, &pvolatilep);
3622 gcc_assert (outer_base != NULL_TREE);
3624 if (pbitpos % BITS_PER_UNIT != 0)
3626 if (dump_enabled_p ())
3627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3628 "failed: bit offset alignment.\n");
3629 return false;
3632 if (preversep)
3634 if (dump_enabled_p ())
3635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3636 "failed: reverse storage order.\n");
3637 return false;
3640 outer_base = build_fold_addr_expr (outer_base);
3641 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3642 &base_iv, false))
3644 if (dump_enabled_p ())
3645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3646 "failed: evolution of base is not affine.\n");
3647 return false;
3650 if (offset)
3652 if (poffset)
3653 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3654 poffset);
3655 else
3656 poffset = offset;
3659 if (!poffset)
3661 offset_iv.base = ssize_int (0);
3662 offset_iv.step = ssize_int (0);
3664 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3665 &offset_iv, false))
3667 if (dump_enabled_p ())
3668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3669 "evolution of offset is not affine.\n");
3670 return false;
3673 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3674 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3675 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3676 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3677 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3679 outer_step = size_binop (PLUS_EXPR,
3680 fold_convert (ssizetype, base_iv.step),
3681 fold_convert (ssizetype, offset_iv.step));
3683 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3684 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3685 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3686 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3687 STMT_VINFO_DR_OFFSET (stmt_info) =
3688 fold_convert (ssizetype, offset_iv.base);
3689 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3690 size_int (highest_pow2_factor (offset_iv.base));
3692 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_NOTE, vect_location,
3695 "\touter base_address: ");
3696 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3697 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3698 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3699 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3700 STMT_VINFO_DR_OFFSET (stmt_info));
3701 dump_printf (MSG_NOTE,
3702 "\n\touter constant offset from base address: ");
3703 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3704 STMT_VINFO_DR_INIT (stmt_info));
3705 dump_printf (MSG_NOTE, "\n\touter step: ");
3706 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3707 STMT_VINFO_DR_STEP (stmt_info));
3708 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3709 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3710 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3711 dump_printf (MSG_NOTE, "\n");
3715 if (STMT_VINFO_DATA_REF (stmt_info))
3717 if (dump_enabled_p ())
3719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3720 "not vectorized: more than one data ref "
3721 "in stmt: ");
3722 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3725 if (is_a <bb_vec_info> (vinfo))
3726 break;
3728 if (gatherscatter != SG_NONE || simd_lane_access)
3729 free_data_ref (dr);
3730 return false;
3733 STMT_VINFO_DATA_REF (stmt_info) = dr;
3734 if (simd_lane_access)
3736 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3737 free_data_ref (datarefs[i]);
3738 datarefs[i] = dr;
3741 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == ADDR_EXPR
3742 && VAR_P (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0))
3743 && DECL_NONALIASED (TREE_OPERAND (DR_BASE_ADDRESS (dr), 0)))
3745 if (dump_enabled_p ())
3747 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3748 "not vectorized: base object not addressable "
3749 "for stmt: ");
3750 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3752 if (is_a <bb_vec_info> (vinfo))
3754 /* In BB vectorization the ref can still participate
3755 in dependence analysis, we just can't vectorize it. */
3756 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3757 continue;
3759 return false;
3762 /* Set vectype for STMT. */
3763 scalar_type = TREE_TYPE (DR_REF (dr));
3764 STMT_VINFO_VECTYPE (stmt_info)
3765 = get_vectype_for_scalar_type (scalar_type);
3766 if (!STMT_VINFO_VECTYPE (stmt_info))
3768 if (dump_enabled_p ())
3770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3771 "not vectorized: no vectype for stmt: ");
3772 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3773 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3774 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3775 scalar_type);
3776 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3779 if (is_a <bb_vec_info> (vinfo))
3781 /* No vector type is fine, the ref can still participate
3782 in dependence analysis, we just can't vectorize it. */
3783 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3784 continue;
3787 if (gatherscatter != SG_NONE || simd_lane_access)
3789 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3790 if (gatherscatter != SG_NONE)
3791 free_data_ref (dr);
3793 return false;
3795 else
3797 if (dump_enabled_p ())
3799 dump_printf_loc (MSG_NOTE, vect_location,
3800 "got vectype for stmt: ");
3801 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3802 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3803 STMT_VINFO_VECTYPE (stmt_info));
3804 dump_printf (MSG_NOTE, "\n");
3808 /* Adjust the minimal vectorization factor according to the
3809 vector type. */
3810 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3811 if (vf > *min_vf)
3812 *min_vf = vf;
3814 if (gatherscatter != SG_NONE)
3816 gather_scatter_info gs_info;
3817 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3818 &gs_info)
3819 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
3821 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3822 free_data_ref (dr);
3823 if (dump_enabled_p ())
3825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3826 (gatherscatter == GATHER) ?
3827 "not vectorized: not suitable for gather "
3828 "load " :
3829 "not vectorized: not suitable for scatter "
3830 "store ");
3831 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3833 return false;
3836 free_data_ref (datarefs[i]);
3837 datarefs[i] = dr;
3838 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3841 else if (is_a <loop_vec_info> (vinfo)
3842 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3844 if (nested_in_vect_loop_p (loop, stmt))
3846 if (dump_enabled_p ())
3848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3849 "not vectorized: not suitable for strided "
3850 "load ");
3851 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3853 return false;
3855 STMT_VINFO_STRIDED_P (stmt_info) = true;
3859 /* If we stopped analysis at the first dataref we could not analyze
3860 when trying to vectorize a basic-block, mark the rest of the datarefs
3861 as not vectorizable and truncate the vector of datarefs. That
3862 avoids spending useless time in analyzing their dependence. */
3863 if (i != datarefs.length ())
3865 gcc_assert (is_a <bb_vec_info> (vinfo));
3866 for (unsigned j = i; j < datarefs.length (); ++j)
3868 data_reference_p dr = datarefs[j];
3869 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3870 free_data_ref (dr);
3872 datarefs.truncate (i);
3875 return true;
3879 /* Function vect_get_new_vect_var.
3881 Returns a name for a new variable.  The current naming scheme prepends
3882 a prefix chosen by VAR_KIND ("vect_", "stmp_", "mask_" or "vectp_")
3883 to the name of the vectorizer-generated variable, using NAME as a
3884 suffix if provided. */
3886 tree
3887 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3889 const char *prefix;
3890 tree new_vect_var;
3892 switch (var_kind)
3894 case vect_simple_var:
3895 prefix = "vect";
3896 break;
3897 case vect_scalar_var:
3898 prefix = "stmp";
3899 break;
3900 case vect_mask_var:
3901 prefix = "mask";
3902 break;
3903 case vect_pointer_var:
3904 prefix = "vectp";
3905 break;
3906 default:
3907 gcc_unreachable ();
3910 if (name)
3912 char* tmp = concat (prefix, "_", name, NULL);
3913 new_vect_var = create_tmp_reg (type, tmp);
3914 free (tmp);
3916 else
3917 new_vect_var = create_tmp_reg (type, prefix);
3919 return new_vect_var;
3922 /* Like vect_get_new_vect_var but return an SSA name. */
3924 tree
3925 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3927 const char *prefix;
3928 tree new_vect_var;
3930 switch (var_kind)
3932 case vect_simple_var:
3933 prefix = "vect";
3934 break;
3935 case vect_scalar_var:
3936 prefix = "stmp";
3937 break;
3938 case vect_pointer_var:
3939 prefix = "vectp";
3940 break;
3941 default:
3942 gcc_unreachable ();
3945 if (name)
3947 char* tmp = concat (prefix, "_", name, NULL);
3948 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3949 free (tmp);
3951 else
3952 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3954 return new_vect_var;
3957 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3959 static void
3960 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3961 stmt_vec_info stmt_info)
3963 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3964 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3965 int misalign = DR_MISALIGNMENT (dr);
3966 if (misalign == DR_MISALIGNMENT_UNKNOWN)
3967 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3968 else
3969 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3972 /* Function vect_create_addr_base_for_vector_ref.
3974 Create an expression that computes the address of the first memory location
3975 that will be accessed for a data reference.
3977 Input:
3978 STMT: The statement containing the data reference.
3979 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3980 OFFSET: Optional. If supplied, it is added to the initial address.
3981 LOOP: Specify relative to which loop-nest should the address be computed.
3982 For example, when the dataref is in an inner-loop nested in an
3983 outer-loop that is now being vectorized, LOOP can be either the
3984 outer-loop, or the inner-loop. The first memory location accessed
3985 by the following dataref ('in' points to short):
3987 for (i=0; i<N; i++)
3988 for (j=0; j<M; j++)
3989 s += in[i+j]
3991 is as follows:
3992 if LOOP=i_loop: &in (relative to i_loop)
3993 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3994 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3995 initial address. Unlike OFFSET, which is the number of elements to
3996 be added, BYTE_OFFSET is measured in bytes.
3998 Output:
3999 1. Return an SSA_NAME whose value is the address of the memory location of
4000 the first vector of the data reference.
4001 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4002 these statement(s) which define the returned SSA_NAME.
4004 FORNOW: We are only handling array accesses with step 1. */
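/* Illustration only (not part of GCC): a minimal scalar sketch of the
   address arithmetic performed by the function below.  All names and the
   ELT_SIZE parameter are hypothetical; DR_BASE/DR_OFFSET/DR_INIT stand in
   for the corresponding DR_* fields, OFFSET is counted in elements of
   ELT_SIZE bytes and BYTE_OFFSET in bytes.  */

static inline char *
example_addr_base_sketch (char *dr_base, size_t dr_offset, size_t dr_init,
                          size_t offset, size_t byte_offset, size_t elt_size)
{
  size_t base_offset = dr_offset + dr_init;  /* "Create base_offset".  */
  base_offset += offset * elt_size;          /* OFFSET scaled by the element size.  */
  base_offset += byte_offset;                /* BYTE_OFFSET added verbatim.  */
  return dr_base + base_offset;              /* base + base_offset.  */
}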
4006 tree
4007 vect_create_addr_base_for_vector_ref (gimple *stmt,
4008 gimple_seq *new_stmt_list,
4009 tree offset,
4010 struct loop *loop,
4011 tree byte_offset)
4013 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4014 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4015 tree data_ref_base;
4016 const char *base_name;
4017 tree addr_base;
4018 tree dest;
4019 gimple_seq seq = NULL;
4020 tree base_offset;
4021 tree init;
4022 tree vect_ptr_type;
4023 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4024 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4026 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4028 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4030 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4032 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4033 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4034 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4036 else
4038 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4039 base_offset = unshare_expr (DR_OFFSET (dr));
4040 init = unshare_expr (DR_INIT (dr));
4043 if (loop_vinfo)
4044 base_name = get_name (data_ref_base);
4045 else
4047 base_offset = ssize_int (0);
4048 init = ssize_int (0);
4049 base_name = get_name (DR_REF (dr));
4052 /* Create base_offset */
4053 base_offset = size_binop (PLUS_EXPR,
4054 fold_convert (sizetype, base_offset),
4055 fold_convert (sizetype, init));
4057 if (offset)
4059 offset = fold_build2 (MULT_EXPR, sizetype,
4060 fold_convert (sizetype, offset), step);
4061 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4062 base_offset, offset);
4064 if (byte_offset)
4066 byte_offset = fold_convert (sizetype, byte_offset);
4067 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4068 base_offset, byte_offset);
4071 /* base + base_offset */
4072 if (loop_vinfo)
4073 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4074 else
4076 addr_base = build1 (ADDR_EXPR,
4077 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4078 unshare_expr (DR_REF (dr)));
4081 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4082 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4083 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4084 gimple_seq_add_seq (new_stmt_list, seq);
4086 if (DR_PTR_INFO (dr)
4087 && TREE_CODE (addr_base) == SSA_NAME
4088 && !SSA_NAME_PTR_INFO (addr_base))
4090 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4091 if (offset || byte_offset)
4092 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4095 if (dump_enabled_p ())
4097 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4098 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4099 dump_printf (MSG_NOTE, "\n");
4102 return addr_base;
4106 /* Function vect_create_data_ref_ptr.
4108 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4109 location accessed in the loop by STMT, along with the def-use update
4110 chain to appropriately advance the pointer through the loop iterations.
4111 Also set aliasing information for the pointer. This pointer is used by
4112 the callers to this function to create a memory reference expression for
4113 vector load/store access.
4115 Input:
4116 1. STMT: a stmt that references memory. Expected to be of the form
4117 GIMPLE_ASSIGN <name, data-ref> or
4118 GIMPLE_ASSIGN <data-ref, name>.
4119 2. AGGR_TYPE: the type of the reference, which should be either a vector
4120 or an array.
4121 3. AT_LOOP: the loop where the vector memref is to be created.
4122 4. OFFSET (optional): an offset to be added to the initial address accessed
4123 by the data-ref in STMT.
4124 5. BSI: location where the new stmts are to be placed if there is no loop
4125 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4126 pointing to the initial address.
4127 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4128 to the initial address accessed by the data-ref in STMT. This is
4129 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4130 in bytes.
4132 Output:
4133 1. Declare a new ptr to vector_type, and have it point to the base of the
4134 data reference (initial address accessed by the data reference).
4135 For example, for vector of type V8HI, the following code is generated:
4137 v8hi *ap;
4138 ap = (v8hi *)initial_address;
4140 if OFFSET is not supplied:
4141 initial_address = &a[init];
4142 if OFFSET is supplied:
4143 initial_address = &a[init + OFFSET];
4144 if BYTE_OFFSET is supplied:
4145 initial_address = &a[init] + BYTE_OFFSET;
4147 Return the initial_address in INITIAL_ADDRESS.
4149 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4150 update the pointer in each iteration of the loop.
4152 Return the increment stmt that updates the pointer in PTR_INCR.
4154 3. Set INV_P to true if the access pattern of the data reference in the
4155 vectorized loop is invariant. Set it to false otherwise.
4157 4. Return the pointer. */
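/* Illustration only (not part of GCC): a scalar model of the pointer
   def-use cycle the function below creates for a simple, non-nested copy
   loop.  The example assumes a hypothetical vector of eight shorts, so the
   aggregate step (TYPE_SIZE_UNIT of the vector type) is 16 bytes, and
   relies on memcpy being visible via the headers included at the top of
   this file.  VP plays the role of the aggregate pointer: initialised once
   before the loop, bumped once per iteration.  */

static inline void
example_data_ref_ptr_sketch (short *out, const short *in, size_t n_vectors)
{
  const char *vp = (const char *) in;  /* (2) initial address in the preheader.  */
  for (size_t i = 0; i < n_vectors; i++)
    {
      memcpy (out + i * 8, vp, 16);    /* one vector access through VP.  */
      vp += 16;                        /* (3) bump VP by the aggregate size.  */
    }
}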
4159 tree
4160 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4161 tree offset, tree *initial_address,
4162 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4163 bool only_init, bool *inv_p, tree byte_offset)
4165 const char *base_name;
4166 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4167 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4168 struct loop *loop = NULL;
4169 bool nested_in_vect_loop = false;
4170 struct loop *containing_loop = NULL;
4171 tree aggr_ptr_type;
4172 tree aggr_ptr;
4173 tree new_temp;
4174 gimple_seq new_stmt_list = NULL;
4175 edge pe = NULL;
4176 basic_block new_bb;
4177 tree aggr_ptr_init;
4178 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4179 tree aptr;
4180 gimple_stmt_iterator incr_gsi;
4181 bool insert_after;
4182 tree indx_before_incr, indx_after_incr;
4183 gimple *incr;
4184 tree step;
4185 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4187 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4188 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4190 if (loop_vinfo)
4192 loop = LOOP_VINFO_LOOP (loop_vinfo);
4193 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4194 containing_loop = (gimple_bb (stmt))->loop_father;
4195 pe = loop_preheader_edge (loop);
4197 else
4199 gcc_assert (bb_vinfo);
4200 only_init = true;
4201 *ptr_incr = NULL;
4204 /* Check the step (evolution) of the load in LOOP, and record
4205 whether it's invariant. */
4206 if (nested_in_vect_loop)
4207 step = STMT_VINFO_DR_STEP (stmt_info);
4208 else
4209 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4211 if (integer_zerop (step))
4212 *inv_p = true;
4213 else
4214 *inv_p = false;
4216 /* Create an expression for the first address accessed by this load
4217 in LOOP. */
4218 base_name = get_name (DR_BASE_ADDRESS (dr));
4220 if (dump_enabled_p ())
4222 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4223 dump_printf_loc (MSG_NOTE, vect_location,
4224 "create %s-pointer variable to type: ",
4225 get_tree_code_name (TREE_CODE (aggr_type)));
4226 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4227 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4228 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4229 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4230 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4231 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4232 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4233 else
4234 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4235 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4236 dump_printf (MSG_NOTE, "\n");
4239 /* (1) Create the new aggregate-pointer variable.
4240 Vector and array types inherit the alias set of their component
4241 type by default, so we need to use a ref-all pointer if the data
4242 reference does not conflict with the created aggregate data
4243 reference because it is not addressable. */
4244 bool need_ref_all = false;
4245 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4246 get_alias_set (DR_REF (dr))))
4247 need_ref_all = true;
4248 /* Likewise for any of the data references in the stmt group. */
4249 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4251 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4254 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4255 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4256 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4257 get_alias_set (DR_REF (sdr))))
4259 need_ref_all = true;
4260 break;
4262 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4264 while (orig_stmt);
4266 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4267 need_ref_all);
4268 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4271 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4272 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4273 def-use update cycles for the pointer: one relative to the outer-loop
4274 (LOOP), which is what steps (3) and (4) below do. The other is relative
4275 to the inner-loop (which is the inner-most loop containing the dataref),
4276 and this is done by step (5) below.
4278 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4279 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4280 redundant. Steps (3),(4) create the following:
4282 vp0 = &base_addr;
4283 LOOP: vp1 = phi(vp0,vp2)
4286 vp2 = vp1 + step
4287 goto LOOP
4289 If there is an inner-loop nested in loop, then step (5) will also be
4290 applied, and an additional update in the inner-loop will be created:
4292 vp0 = &base_addr;
4293 LOOP: vp1 = phi(vp0,vp2)
4295 inner: vp3 = phi(vp1,vp4)
4296 vp4 = vp3 + inner_step
4297 if () goto inner
4299 vp2 = vp1 + step
4300 if () goto LOOP */
4302 /* (2) Calculate the initial address of the aggregate-pointer, and set
4303 the aggregate-pointer to point to it before the loop. */
4305 /* Create: (&(base[init_val+offset]) + byte_offset) in the loop preheader. */
4307 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4308 offset, loop, byte_offset);
4309 if (new_stmt_list)
4311 if (pe)
4313 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4314 gcc_assert (!new_bb);
4316 else
4317 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4320 *initial_address = new_temp;
4321 aggr_ptr_init = new_temp;
4323 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4324 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4325 inner-loop nested in LOOP (during outer-loop vectorization). */
4327 /* No update in loop is required. */
4328 if (only_init && (!loop_vinfo || at_loop == loop))
4329 aptr = aggr_ptr_init;
4330 else
4332 /* The step of the aggregate pointer is the type size. */
4333 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4334 /* One exception to the above is when the scalar step of the load in
4335 LOOP is zero. In this case the step here is also zero. */
4336 if (*inv_p)
4337 iv_step = size_zero_node;
4338 else if (tree_int_cst_sgn (step) == -1)
4339 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4341 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4343 create_iv (aggr_ptr_init,
4344 fold_convert (aggr_ptr_type, iv_step),
4345 aggr_ptr, loop, &incr_gsi, insert_after,
4346 &indx_before_incr, &indx_after_incr);
4347 incr = gsi_stmt (incr_gsi);
4348 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4350 /* Copy the points-to information if it exists. */
4351 if (DR_PTR_INFO (dr))
4353 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4354 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4356 if (ptr_incr)
4357 *ptr_incr = incr;
4359 aptr = indx_before_incr;
4362 if (!nested_in_vect_loop || only_init)
4363 return aptr;
4366 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4367 nested in LOOP, if it exists. */
4369 gcc_assert (nested_in_vect_loop);
4370 if (!only_init)
4372 standard_iv_increment_position (containing_loop, &incr_gsi,
4373 &insert_after);
4374 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4375 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4376 &indx_after_incr);
4377 incr = gsi_stmt (incr_gsi);
4378 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4380 /* Copy the points-to information if it exists. */
4381 if (DR_PTR_INFO (dr))
4383 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4384 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4386 if (ptr_incr)
4387 *ptr_incr = incr;
4389 return indx_before_incr;
4391 else
4392 gcc_unreachable ();
4396 /* Function bump_vector_ptr
4398 Increment a pointer (to a vector type) by vector-size. If requested,
4399 i.e. if PTR-INCR is given, then also connect the new increment stmt
4400 to the existing def-use update-chain of the pointer, by modifying
4401 the PTR_INCR as illustrated below:
4403 The pointer def-use update-chain before this function:
4404 DATAREF_PTR = phi (p_0, p_2)
4405 ....
4406 PTR_INCR: p_2 = DATAREF_PTR + step
4408 The pointer def-use update-chain after this function:
4409 DATAREF_PTR = phi (p_0, p_2)
4410 ....
4411 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4412 ....
4413 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4415 Input:
4416 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4417 in the loop.
4418 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4419 the loop. The increment amount across iterations is expected
4420 to be vector_size.
4421 BSI - location where the new update stmt is to be placed.
4422 STMT - the original scalar memory-access stmt that is being vectorized.
4423 BUMP - optional. The offset by which to bump the pointer. If not given,
4424 the offset is assumed to be vector_size.
4426 Output: Return NEW_DATAREF_PTR as illustrated above. */
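/* Illustration only (not part of GCC): a scalar model of the rewritten
   def-use chain shown above, with hypothetical byte quantities BUMP and
   STEP.  After the bump, one iteration advances the pointer by BUMP + STEP
   bytes in total, matching the diagram above.  */

static inline const char *
example_bump_vector_ptr_sketch (const char *dataref_ptr, size_t bump,
                                size_t step)
{
  const char *new_dataref_ptr = dataref_ptr + bump;  /* NEW_DATAREF_PTR = DATAREF_PTR + BUMP  */
  const char *p_2 = new_dataref_ptr + step;          /* PTR_INCR: p_2 = NEW_DATAREF_PTR + step  */
  return p_2;
}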
4430 tree
4431 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4432 gimple *stmt, tree bump)
4434 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4435 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4436 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4437 tree update = TYPE_SIZE_UNIT (vectype);
4438 gassign *incr_stmt;
4439 ssa_op_iter iter;
4440 use_operand_p use_p;
4441 tree new_dataref_ptr;
4443 if (bump)
4444 update = bump;
4446 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4447 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4448 else
4449 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4450 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4451 dataref_ptr, update);
4452 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4454 /* Copy the points-to information if it exists. */
4455 if (DR_PTR_INFO (dr))
4457 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4458 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4461 if (!ptr_incr)
4462 return new_dataref_ptr;
4464 /* Update the vector-pointer's cross-iteration increment. */
4465 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4467 tree use = USE_FROM_PTR (use_p);
4469 if (use == dataref_ptr)
4470 SET_USE (use_p, new_dataref_ptr);
4471 else
4472 gcc_assert (tree_int_cst_compare (use, update) == 0);
4475 return new_dataref_ptr;
4479 /* Function vect_create_destination_var.
4481 Create a new temporary of type VECTYPE. */
4483 tree
4484 vect_create_destination_var (tree scalar_dest, tree vectype)
4486 tree vec_dest;
4487 const char *name;
4488 char *new_name;
4489 tree type;
4490 enum vect_var_kind kind;
4492 kind = vectype
4493 ? VECTOR_BOOLEAN_TYPE_P (vectype)
4494 ? vect_mask_var
4495 : vect_simple_var
4496 : vect_scalar_var;
4497 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4499 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4501 name = get_name (scalar_dest);
4502 if (name)
4503 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4504 else
4505 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4506 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4507 free (new_name);
4509 return vec_dest;
4512 /* Function vect_grouped_store_supported.
4514 Returns TRUE if interleave high and interleave low permutations
4515 are supported, and FALSE otherwise. */
4517 bool
4518 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4520 machine_mode mode = TYPE_MODE (vectype);
4522 /* vect_permute_store_chain requires the group size to be equal to 3 or
4523 be a power of two. */
4524 if (count != 3 && exact_log2 (count) == -1)
4526 if (dump_enabled_p ())
4527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4528 "the size of the group of accesses"
4529 " is not a power of 2 or not eqaul to 3\n");
4530 return false;
4533 /* Check that the permutation is supported. */
4534 if (VECTOR_MODE_P (mode))
4536 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4537 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4539 if (count == 3)
4541 unsigned int j0 = 0, j1 = 0, j2 = 0;
4542 unsigned int i, j;
4544 for (j = 0; j < 3; j++)
4546 int nelt0 = ((3 - j) * nelt) % 3;
4547 int nelt1 = ((3 - j) * nelt + 1) % 3;
4548 int nelt2 = ((3 - j) * nelt + 2) % 3;
4549 for (i = 0; i < nelt; i++)
4551 if (3 * i + nelt0 < nelt)
4552 sel[3 * i + nelt0] = j0++;
4553 if (3 * i + nelt1 < nelt)
4554 sel[3 * i + nelt1] = nelt + j1++;
4555 if (3 * i + nelt2 < nelt)
4556 sel[3 * i + nelt2] = 0;
4558 if (!can_vec_perm_p (mode, false, sel))
4560 if (dump_enabled_p ())
4561 dump_printf (MSG_MISSED_OPTIMIZATION,
4562 "permutaion op not supported by target.\n");
4563 return false;
4566 for (i = 0; i < nelt; i++)
4568 if (3 * i + nelt0 < nelt)
4569 sel[3 * i + nelt0] = 3 * i + nelt0;
4570 if (3 * i + nelt1 < nelt)
4571 sel[3 * i + nelt1] = 3 * i + nelt1;
4572 if (3 * i + nelt2 < nelt)
4573 sel[3 * i + nelt2] = nelt + j2++;
4575 if (!can_vec_perm_p (mode, false, sel))
4577 if (dump_enabled_p ())
4578 dump_printf (MSG_MISSED_OPTIMIZATION,
4579 "permutaion op not supported by target.\n");
4580 return false;
4583 return true;
4585 else
4587 /* If length is not equal to 3 then only a power of 2 is supported. */
4588 gcc_assert (pow2p_hwi (count));
4590 for (i = 0; i < nelt / 2; i++)
4592 sel[i * 2] = i;
4593 sel[i * 2 + 1] = i + nelt;
4595 if (can_vec_perm_p (mode, false, sel))
4597 for (i = 0; i < nelt; i++)
4598 sel[i] += nelt / 2;
4599 if (can_vec_perm_p (mode, false, sel))
4600 return true;
4605 if (dump_enabled_p ())
4606 dump_printf (MSG_MISSED_OPTIMIZATION,
4607 "permutaion op not supported by target.\n");
4608 return false;
4612 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4613 type VECTYPE. */
4615 bool
4616 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4618 return vect_lanes_optab_supported_p ("vec_store_lanes",
4619 vec_store_lanes_optab,
4620 vectype, count);
4624 /* Function vect_permute_store_chain.
4626 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4627 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4628 the data correctly for the stores. Return the final references for stores
4629 in RESULT_CHAIN.
4631 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4632 The input is 4 vectors each containing 8 elements. We assign a number to
4633 each element, the input sequence is:
4635 1st vec: 0 1 2 3 4 5 6 7
4636 2nd vec: 8 9 10 11 12 13 14 15
4637 3rd vec: 16 17 18 19 20 21 22 23
4638 4th vec: 24 25 26 27 28 29 30 31
4640 The output sequence should be:
4642 1st vec: 0 8 16 24 1 9 17 25
4643 2nd vec: 2 10 18 26 3 11 19 27
4644 3rd vec: 4 12 20 28 5 13 21 29
4645 4th vec: 6 14 22 30 7 15 23 31
4647 i.e., we interleave the contents of the four vectors in their order.
4649 We use interleave_high/low instructions to create such output. The input of
4650 each interleave_high/low operation is two vectors:
4651 1st vec 2nd vec
4652 0 1 2 3 4 5 6 7
4653 the even elements of the result vector are obtained left-to-right from the
4654 high/low elements of the first vector. The odd elements of the result are
4655 obtained left-to-right from the high/low elements of the second vector.
4656 The output of interleave_high will be: 0 4 1 5
4657 and of interleave_low: 2 6 3 7
4660 The permutation is done in log LENGTH stages. In each stage interleave_high
4661 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4662 where the first argument is taken from the first half of DR_CHAIN and the
4663 second argument from its second half.
4664 In our example,
4666 I1: interleave_high (1st vec, 3rd vec)
4667 I2: interleave_low (1st vec, 3rd vec)
4668 I3: interleave_high (2nd vec, 4th vec)
4669 I4: interleave_low (2nd vec, 4th vec)
4671 The output for the first stage is:
4673 I1: 0 16 1 17 2 18 3 19
4674 I2: 4 20 5 21 6 22 7 23
4675 I3: 8 24 9 25 10 26 11 27
4676 I4: 12 28 13 29 14 30 15 31
4678 The output of the second stage, i.e. the final result is:
4680 I1: 0 8 16 24 1 9 17 25
4681 I2: 2 10 18 26 3 11 19 27
4682 I3: 4 12 20 28 5 13 21 29
4683 I4: 6 14 22 30 7 15 23 31. */
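/* Illustration only (not part of GCC): a scalar model of one
   interleave_high/interleave_low step for the power-of-2 case handled
   below.  The selector values mirror perm_mask_high ({0, nelt, 1, nelt+1,
   ...}) and perm_mask_low (the same selector shifted by nelt/2), applied
   across the concatenation V1 | V2 the way VEC_PERM_EXPR would.  All names
   are hypothetical and NELT is assumed to be even; with v1 = {0..3} and
   v2 = {4..7} this yields high = {0,4,1,5} and low = {2,6,3,7}, matching
   the example in the comment above.  */

static inline void
example_interleave_sketch (const int *v1, const int *v2, int *high, int *low,
                           unsigned int nelt)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      unsigned int hi_sel = (i & 1) ? i / 2 + nelt : i / 2;  /* perm_mask_high[i]  */
      unsigned int lo_sel = hi_sel + nelt / 2;               /* perm_mask_low[i]  */
      high[i] = hi_sel < nelt ? v1[hi_sel] : v2[hi_sel - nelt];
      low[i] = lo_sel < nelt ? v1[lo_sel] : v2[lo_sel - nelt];
    }
}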
4685 void
4686 vect_permute_store_chain (vec<tree> dr_chain,
4687 unsigned int length,
4688 gimple *stmt,
4689 gimple_stmt_iterator *gsi,
4690 vec<tree> *result_chain)
4692 tree vect1, vect2, high, low;
4693 gimple *perm_stmt;
4694 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4695 tree perm_mask_low, perm_mask_high;
4696 tree data_ref;
4697 tree perm3_mask_low, perm3_mask_high;
4698 unsigned int i, n, log_length = exact_log2 (length);
4699 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4700 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4702 result_chain->quick_grow (length);
4703 memcpy (result_chain->address (), dr_chain.address (),
4704 length * sizeof (tree));
4706 if (length == 3)
4708 unsigned int j0 = 0, j1 = 0, j2 = 0;
4710 for (j = 0; j < 3; j++)
4712 int nelt0 = ((3 - j) * nelt) % 3;
4713 int nelt1 = ((3 - j) * nelt + 1) % 3;
4714 int nelt2 = ((3 - j) * nelt + 2) % 3;
4716 for (i = 0; i < nelt; i++)
4718 if (3 * i + nelt0 < nelt)
4719 sel[3 * i + nelt0] = j0++;
4720 if (3 * i + nelt1 < nelt)
4721 sel[3 * i + nelt1] = nelt + j1++;
4722 if (3 * i + nelt2 < nelt)
4723 sel[3 * i + nelt2] = 0;
4725 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4727 for (i = 0; i < nelt; i++)
4729 if (3 * i + nelt0 < nelt)
4730 sel[3 * i + nelt0] = 3 * i + nelt0;
4731 if (3 * i + nelt1 < nelt)
4732 sel[3 * i + nelt1] = 3 * i + nelt1;
4733 if (3 * i + nelt2 < nelt)
4734 sel[3 * i + nelt2] = nelt + j2++;
4736 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4738 vect1 = dr_chain[0];
4739 vect2 = dr_chain[1];
4741 /* Create interleaving stmt:
4742 low = VEC_PERM_EXPR <vect1, vect2,
4743 {j, nelt, *, j + 1, nelt + j + 1, *,
4744 j + 2, nelt + j + 2, *, ...}> */
4745 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4746 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4747 vect2, perm3_mask_low);
4748 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4750 vect1 = data_ref;
4751 vect2 = dr_chain[2];
4752 /* Create interleaving stmt:
4753 low = VEC_PERM_EXPR <vect1, vect2,
4754 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4755 6, 7, nelt + j + 2, ...}> */
4756 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4757 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4758 vect2, perm3_mask_high);
4759 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4760 (*result_chain)[j] = data_ref;
4763 else
4765 /* If length is not equal to 3 then only a power of 2 is supported. */
4766 gcc_assert (pow2p_hwi (length));
4768 for (i = 0, n = nelt / 2; i < n; i++)
4770 sel[i * 2] = i;
4771 sel[i * 2 + 1] = i + nelt;
4773 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4775 for (i = 0; i < nelt; i++)
4776 sel[i] += nelt / 2;
4777 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4779 for (i = 0, n = log_length; i < n; i++)
4781 for (j = 0; j < length/2; j++)
4783 vect1 = dr_chain[j];
4784 vect2 = dr_chain[j+length/2];
4786 /* Create interleaving stmt:
4787 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4788 ...}> */
4789 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4790 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4791 vect2, perm_mask_high);
4792 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4793 (*result_chain)[2*j] = high;
4795 /* Create interleaving stmt:
4796 low = VEC_PERM_EXPR <vect1, vect2,
4797 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4798 ...}> */
4799 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4800 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4801 vect2, perm_mask_low);
4802 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4803 (*result_chain)[2*j+1] = low;
4805 memcpy (dr_chain.address (), result_chain->address (),
4806 length * sizeof (tree));
4811 /* Function vect_setup_realignment
4813 This function is called when vectorizing an unaligned load using
4814 the dr_explicit_realign[_optimized] scheme.
4815 This function generates the following code at the loop prolog:
4817 p = initial_addr;
4818 x msq_init = *(floor(p)); # prolog load
4819 realignment_token = call target_builtin;
4820 loop:
4821 x msq = phi (msq_init, ---)
4823 The stmts marked with x are generated only for the case of
4824 dr_explicit_realign_optimized.
4826 The code above sets up a new (vector) pointer, pointing to the first
4827 location accessed by STMT, and a "floor-aligned" load using that pointer.
4828 It also generates code to compute the "realignment-token" (if the relevant
4829 target hook was defined), and creates a phi-node at the loop-header bb
4830 whose arguments are the result of the prolog-load (created by this
4831 function) and the result of a load that takes place in the loop (to be
4832 created by the caller to this function).
4834 For the case of dr_explicit_realign_optimized:
4835 The caller to this function uses the phi-result (msq) to create the
4836 realignment code inside the loop, and sets up the missing phi argument,
4837 as follows:
4838 loop:
4839 msq = phi (msq_init, lsq)
4840 lsq = *(floor(p')); # load in loop
4841 result = realign_load (msq, lsq, realignment_token);
4843 For the case of dr_explicit_realign:
4844 loop:
4845 msq = *(floor(p)); # load in loop
4846 p' = p + (VS-1);
4847 lsq = *(floor(p')); # load in loop
4848 result = realign_load (msq, lsq, realignment_token);
4850 Input:
4851 STMT - (scalar) load stmt to be vectorized. This load accesses
4852 a memory location that may be unaligned.
4853 BSI - place where new code is to be inserted.
4854 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4855 is used.
4857 Output:
4858 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4859 target hook, if defined.
4860 Return value - the result of the loop-header phi node. */
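/* Illustration only (not part of GCC): a scalar model of the
   dr_explicit_realign scheme described above, assuming a hypothetical
   16-byte vector size and that memcpy and uintptr_t are visible via the
   headers included at the top of this file.  MSQ and LSQ are the two
   aligned loads bracketing the unaligned address P, and OFS plays the role
   of the realignment token.  */

static inline void
example_realign_load_sketch (const unsigned char *p, unsigned char out[16])
{
  const unsigned char *msq_p
    = (const unsigned char *) ((uintptr_t) p & ~(uintptr_t) 15);          /* floor (p)  */
  const unsigned char *lsq_p
    = (const unsigned char *) (((uintptr_t) p + 15) & ~(uintptr_t) 15);   /* floor (p + VS - 1)  */
  unsigned int ofs = (unsigned int) (p - msq_p);  /* realignment token  */
  unsigned char msq[16], lsq[16];
  memcpy (msq, msq_p, 16);               /* msq = *(floor (p))  */
  memcpy (lsq, lsq_p, 16);               /* lsq = *(floor (p'))  */
  for (unsigned int i = 0; i < 16; i++)  /* realign_load (msq, lsq, token)  */
    out[i] = i + ofs < 16 ? msq[i + ofs] : lsq[i + ofs - 16];
}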
4862 tree
4863 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4864 tree *realignment_token,
4865 enum dr_alignment_support alignment_support_scheme,
4866 tree init_addr,
4867 struct loop **at_loop)
4869 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4870 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4871 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4872 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4873 struct loop *loop = NULL;
4874 edge pe = NULL;
4875 tree scalar_dest = gimple_assign_lhs (stmt);
4876 tree vec_dest;
4877 gimple *inc;
4878 tree ptr;
4879 tree data_ref;
4880 basic_block new_bb;
4881 tree msq_init = NULL_TREE;
4882 tree new_temp;
4883 gphi *phi_stmt;
4884 tree msq = NULL_TREE;
4885 gimple_seq stmts = NULL;
4886 bool inv_p;
4887 bool compute_in_loop = false;
4888 bool nested_in_vect_loop = false;
4889 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4890 struct loop *loop_for_initial_load = NULL;
4892 if (loop_vinfo)
4894 loop = LOOP_VINFO_LOOP (loop_vinfo);
4895 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4898 gcc_assert (alignment_support_scheme == dr_explicit_realign
4899 || alignment_support_scheme == dr_explicit_realign_optimized);
4901 /* We need to generate three things:
4902 1. the misalignment computation
4903 2. the extra vector load (for the optimized realignment scheme).
4904 3. the phi node for the two vectors from which the realignment is
4905 done (for the optimized realignment scheme). */
4907 /* 1. Determine where to generate the misalignment computation.
4909 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4910 calculation will be generated by this function, outside the loop (in the
4911 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4912 caller, inside the loop.
4914 Background: If the misalignment remains fixed throughout the iterations of
4915 the loop, then both realignment schemes are applicable, and also the
4916 misalignment computation can be done outside LOOP. This is because we are
4917 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4918 are a multiple of VS (the Vector Size), and therefore the misalignment in
4919 different vectorized LOOP iterations is always the same.
4920 The problem arises only if the memory access is in an inner-loop nested
4921 inside LOOP, which is now being vectorized using outer-loop vectorization.
4922 This is the only case when the misalignment of the memory access may not
4923 remain fixed throughout the iterations of the inner-loop (as explained in
4924 detail in vect_supportable_dr_alignment). In this case, not only is the
4925 optimized realignment scheme not applicable, but also the misalignment
4926 computation (and generation of the realignment token that is passed to
4927 REALIGN_LOAD) have to be done inside the loop.
4929 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4930 or not, which in turn determines if the misalignment is computed inside
4931 the inner-loop, or outside LOOP. */
4933 if (init_addr != NULL_TREE || !loop_vinfo)
4935 compute_in_loop = true;
4936 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4940 /* 2. Determine where to generate the extra vector load.
4942 For the optimized realignment scheme, instead of generating two vector
4943 loads in each iteration, we generate a single extra vector load in the
4944 preheader of the loop, and in each iteration reuse the result of the
4945 vector load from the previous iteration. In case the memory access is in
4946 an inner-loop nested inside LOOP, which is now being vectorized using
4947 outer-loop vectorization, we need to determine whether this initial vector
4948 load should be generated at the preheader of the inner-loop, or can be
4949 generated at the preheader of LOOP. If the memory access has no evolution
4950 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4951 to be generated inside LOOP (in the preheader of the inner-loop). */
4953 if (nested_in_vect_loop)
4955 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4956 bool invariant_in_outerloop =
4957 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4958 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4960 else
4961 loop_for_initial_load = loop;
4962 if (at_loop)
4963 *at_loop = loop_for_initial_load;
4965 if (loop_for_initial_load)
4966 pe = loop_preheader_edge (loop_for_initial_load);
4968 /* 3. For the case of the optimized realignment, create the first vector
4969 load at the loop preheader. */
4971 if (alignment_support_scheme == dr_explicit_realign_optimized)
4973 /* Create msq_init = *(floor(p1)) in the loop preheader */
4974 gassign *new_stmt;
4976 gcc_assert (!compute_in_loop);
4977 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4978 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4979 NULL_TREE, &init_addr, NULL, &inc,
4980 true, &inv_p);
4981 if (TREE_CODE (ptr) == SSA_NAME)
4982 new_temp = copy_ssa_name (ptr);
4983 else
4984 new_temp = make_ssa_name (TREE_TYPE (ptr));
4985 new_stmt = gimple_build_assign
4986 (new_temp, BIT_AND_EXPR, ptr,
4987 build_int_cst (TREE_TYPE (ptr),
4988 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4989 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4990 gcc_assert (!new_bb);
4991 data_ref
4992 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4993 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4994 new_stmt = gimple_build_assign (vec_dest, data_ref);
4995 new_temp = make_ssa_name (vec_dest, new_stmt);
4996 gimple_assign_set_lhs (new_stmt, new_temp);
4997 if (pe)
4999 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5000 gcc_assert (!new_bb);
5002 else
5003 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5005 msq_init = gimple_assign_lhs (new_stmt);
5008 /* 4. Create realignment token using a target builtin, if available.
5009 It is done either inside the containing loop, or before LOOP (as
5010 determined above). */
5012 if (targetm.vectorize.builtin_mask_for_load)
5014 gcall *new_stmt;
5015 tree builtin_decl;
5017 /* Compute INIT_ADDR - the initial address accessed by this memref. */
5018 if (!init_addr)
5020 /* Generate the INIT_ADDR computation outside LOOP. */
5021 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
5022 NULL_TREE, loop);
5023 if (loop)
5025 pe = loop_preheader_edge (loop);
5026 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5027 gcc_assert (!new_bb);
5029 else
5030 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5033 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5034 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5035 vec_dest =
5036 vect_create_destination_var (scalar_dest,
5037 gimple_call_return_type (new_stmt));
5038 new_temp = make_ssa_name (vec_dest, new_stmt);
5039 gimple_call_set_lhs (new_stmt, new_temp);
5041 if (compute_in_loop)
5042 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5043 else
5045 /* Generate the misalignment computation outside LOOP. */
5046 pe = loop_preheader_edge (loop);
5047 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5048 gcc_assert (!new_bb);
5051 *realignment_token = gimple_call_lhs (new_stmt);
5053 /* The result of the CALL_EXPR to this builtin is determined from
5054 the value of the parameter and no global variables are touched
5055 which makes the builtin a "const" function. Requiring the
5056 builtin to have the "const" attribute makes it unnecessary
5057 to call mark_call_clobbered. */
5058 gcc_assert (TREE_READONLY (builtin_decl));
5061 if (alignment_support_scheme == dr_explicit_realign)
5062 return msq;
5064 gcc_assert (!compute_in_loop);
5065 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5068 /* 5. Create msq = phi <msq_init, lsq> in loop */
5070 pe = loop_preheader_edge (containing_loop);
5071 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5072 msq = make_ssa_name (vec_dest);
5073 phi_stmt = create_phi_node (msq, containing_loop->header);
5074 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5076 return msq;
5080 /* Function vect_grouped_load_supported.
5082 COUNT is the size of the load group (the number of statements plus the
5083 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5084 only one statement, with a gap of COUNT - 1.
5086 Returns true if a suitable permute exists. */
5088 bool
5089 vect_grouped_load_supported (tree vectype, bool single_element_p,
5090 unsigned HOST_WIDE_INT count)
5092 machine_mode mode = TYPE_MODE (vectype);
5094 /* If this is single-element interleaving with an element distance
5095 that leaves unused vector loads around, punt - we at least create
5096 very sub-optimal code in that case (and blow up memory,
5097 see PR65518). */
5098 if (single_element_p && count > TYPE_VECTOR_SUBPARTS (vectype))
5100 if (dump_enabled_p ())
5101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5102 "single-element interleaving not supported "
5103 "for not adjacent vector loads\n");
5104 return false;
5107 /* vect_permute_load_chain requires the group size to be equal to 3 or
5108 be a power of two. */
5109 if (count != 3 && exact_log2 (count) == -1)
5111 if (dump_enabled_p ())
5112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5113 "the size of the group of accesses"
5114 " is not a power of 2 or not equal to 3\n");
5115 return false;
5118 /* Check that the permutation is supported. */
5119 if (VECTOR_MODE_P (mode))
5121 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5122 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5124 if (count == 3)
5126 unsigned int k;
5127 for (k = 0; k < 3; k++)
5129 for (i = 0; i < nelt; i++)
5130 if (3 * i + k < 2 * nelt)
5131 sel[i] = 3 * i + k;
5132 else
5133 sel[i] = 0;
5134 if (!can_vec_perm_p (mode, false, sel))
5136 if (dump_enabled_p ())
5137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5138 "shuffle of 3 loads is not supported by"
5139 " target\n");
5140 return false;
5142 for (i = 0, j = 0; i < nelt; i++)
5143 if (3 * i + k < 2 * nelt)
5144 sel[i] = i;
5145 else
5146 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5147 if (!can_vec_perm_p (mode, false, sel))
5149 if (dump_enabled_p ())
5150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5151 "shuffle of 3 loads is not supported by"
5152 " target\n");
5153 return false;
5156 return true;
5158 else
5160 /* If length is not equal to 3 then only a power of 2 is supported. */
5161 gcc_assert (pow2p_hwi (count));
5162 for (i = 0; i < nelt; i++)
5163 sel[i] = i * 2;
5164 if (can_vec_perm_p (mode, false, sel))
5166 for (i = 0; i < nelt; i++)
5167 sel[i] = i * 2 + 1;
5168 if (can_vec_perm_p (mode, false, sel))
5169 return true;
5174 if (dump_enabled_p ())
5175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5176 "extract even/odd not supported by target\n");
5177 return false;
5180 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5181 type VECTYPE. */
5183 bool
5184 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5186 return vect_lanes_optab_supported_p ("vec_load_lanes",
5187 vec_load_lanes_optab,
5188 vectype, count);
5191 /* Function vect_permute_load_chain.
5193 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5194 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5195 the input data correctly. Return the final references for loads in
5196 RESULT_CHAIN.
5198 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5199 The input is 4 vectors each containing 8 elements. We assign a number to each
5200 element, the input sequence is:
5202 1st vec: 0 1 2 3 4 5 6 7
5203 2nd vec: 8 9 10 11 12 13 14 15
5204 3rd vec: 16 17 18 19 20 21 22 23
5205 4th vec: 24 25 26 27 28 29 30 31
5207 The output sequence should be:
5209 1st vec: 0 4 8 12 16 20 24 28
5210 2nd vec: 1 5 9 13 17 21 25 29
5211 3rd vec: 2 6 10 14 18 22 26 30
5212 4th vec: 3 7 11 15 19 23 27 31
5214 i.e., the first output vector should contain the first elements of each
5215 interleaving group, etc.
5217 We use extract_even/odd instructions to create such output. The input of
5218 each extract_even/odd operation is two vectors
5219 1st vec 2nd vec
5220 0 1 2 3 4 5 6 7
5222 and the output is the vector of extracted even/odd elements. The output of
5223 extract_even will be: 0 2 4 6
5224 and of extract_odd: 1 3 5 7
5227 The permutation is done in log LENGTH stages. In each stage extract_even
5228 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5229 their order. In our example,
5231 E1: extract_even (1st vec, 2nd vec)
5232 E2: extract_odd (1st vec, 2nd vec)
5233 E3: extract_even (3rd vec, 4th vec)
5234 E4: extract_odd (3rd vec, 4th vec)
5236 The output for the first stage will be:
5238 E1: 0 2 4 6 8 10 12 14
5239 E2: 1 3 5 7 9 11 13 15
5240 E3: 16 18 20 22 24 26 28 30
5241 E4: 17 19 21 23 25 27 29 31
5243 In order to proceed and create the correct sequence for the next stage (or
5244 for the correct output, if the second stage is the last one, as in our
5245 example), we first put the output of extract_even operation and then the
5246 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5247 The input for the second stage is:
5249 1st vec (E1): 0 2 4 6 8 10 12 14
5250 2nd vec (E3): 16 18 20 22 24 26 28 30
5251 3rd vec (E2): 1 3 5 7 9 11 13 15
5252 4th vec (E4): 17 19 21 23 25 27 29 31
5254 The output of the second stage:
5256 E1: 0 4 8 12 16 20 24 28
5257 E2: 2 6 10 14 18 22 26 30
5258 E3: 1 5 9 13 17 21 25 29
5259 E4: 3 7 11 15 19 23 27 31
5261 And RESULT_CHAIN after reordering:
5263 1st vec (E1): 0 4 8 12 16 20 24 28
5264 2nd vec (E3): 1 5 9 13 17 21 25 29
5265 3rd vec (E2): 2 6 10 14 18 22 26 30
5266 4th vec (E4): 3 7 11 15 19 23 27 31. */
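/* Illustration only (not part of GCC): a scalar model of one extract
   even/odd step for the power-of-2 case handled below.  The selector values
   mirror perm_mask_even ({0, 2, 4, ...}) and perm_mask_odd ({1, 3, 5, ...}),
   applied across the concatenation V1 | V2 the way VEC_PERM_EXPR would;
   with v1 = {0..7} and v2 = {8..15} this reproduces E1 and E2 from the
   comment above.  All names are hypothetical.  */

static inline void
example_extract_even_odd_sketch (const int *v1, const int *v2,
                                 int *even, int *odd, unsigned int nelt)
{
  for (unsigned int i = 0; i < nelt; i++)
    {
      unsigned int e = i * 2;      /* perm_mask_even[i]  */
      unsigned int o = i * 2 + 1;  /* perm_mask_odd[i]  */
      even[i] = e < nelt ? v1[e] : v2[e - nelt];
      odd[i] = o < nelt ? v1[o] : v2[o - nelt];
    }
}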
5268 static void
5269 vect_permute_load_chain (vec<tree> dr_chain,
5270 unsigned int length,
5271 gimple *stmt,
5272 gimple_stmt_iterator *gsi,
5273 vec<tree> *result_chain)
5275 tree data_ref, first_vect, second_vect;
5276 tree perm_mask_even, perm_mask_odd;
5277 tree perm3_mask_low, perm3_mask_high;
5278 gimple *perm_stmt;
5279 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5280 unsigned int i, j, log_length = exact_log2 (length);
5281 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5282 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5284 result_chain->quick_grow (length);
5285 memcpy (result_chain->address (), dr_chain.address (),
5286 length * sizeof (tree));
5288 if (length == 3)
5290 unsigned int k;
5292 for (k = 0; k < 3; k++)
5294 for (i = 0; i < nelt; i++)
5295 if (3 * i + k < 2 * nelt)
5296 sel[i] = 3 * i + k;
5297 else
5298 sel[i] = 0;
5299 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5301 for (i = 0, j = 0; i < nelt; i++)
5302 if (3 * i + k < 2 * nelt)
5303 sel[i] = i;
5304 else
5305 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5307 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5309 first_vect = dr_chain[0];
5310 second_vect = dr_chain[1];
5312 /* Create interleaving stmt (low part of):
5313 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5314 ...}> */
5315 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5316 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5317 second_vect, perm3_mask_low);
5318 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5320 /* Create interleaving stmt (high part of):
5321 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5322 ...}> */
5323 first_vect = data_ref;
5324 second_vect = dr_chain[2];
5325 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5326 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5327 second_vect, perm3_mask_high);
5328 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5329 (*result_chain)[k] = data_ref;
5332 else
5334 /* If length is not equal to 3 then only a power of 2 is supported. */
5335 gcc_assert (pow2p_hwi (length));
5337 for (i = 0; i < nelt; ++i)
5338 sel[i] = i * 2;
5339 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5341 for (i = 0; i < nelt; ++i)
5342 sel[i] = i * 2 + 1;
5343 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5345 for (i = 0; i < log_length; i++)
5347 for (j = 0; j < length; j += 2)
5349 first_vect = dr_chain[j];
5350 second_vect = dr_chain[j+1];
5352 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5353 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5354 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5355 first_vect, second_vect,
5356 perm_mask_even);
5357 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5358 (*result_chain)[j/2] = data_ref;
5360 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5361 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5362 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5363 first_vect, second_vect,
5364 perm_mask_odd);
5365 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5366 (*result_chain)[j/2+length/2] = data_ref;
5368 memcpy (dr_chain.address (), result_chain->address (),
5369 length * sizeof (tree));
5374 /* Function vect_shift_permute_load_chain.
5376 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5377 sequence of stmts to reorder the input data accordingly.
5378 Return the final references for loads in RESULT_CHAIN.
5379 Return true if successful, false otherwise.
5381 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5382 The input is 3 vectors each containing 8 elements. We assign a
5383 number to each element, the input sequence is:
5385 1st vec: 0 1 2 3 4 5 6 7
5386 2nd vec: 8 9 10 11 12 13 14 15
5387 3rd vec: 16 17 18 19 20 21 22 23
5389 The output sequence should be:
5391 1st vec: 0 3 6 9 12 15 18 21
5392 2nd vec: 1 4 7 10 13 16 19 22
5393 3rd vec: 2 5 8 11 14 17 20 23
5395 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create this output.
5397 First we shuffle all 3 vectors to get correct elements order:
5399 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5400 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5401 3rd vec: (16 19 22) (17 20 23) (18 21)
5403 Next we unite and shift vector 3 times:
5405 1st step:
5406 shift right by 6 the concatenation of:
5407 "1st vec" and "2nd vec"
5408 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5409 "2nd vec" and "3rd vec"
5410 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5411 "3rd vec" and "1st vec"
5412 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5413 | New vectors |
5415 So that now new vectors are:
5417 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5418 2nd vec: (10 13) (16 19 22) (17 20 23)
5419 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5421 2nd step:
5422 shift right by 5 the concatenation of:
5423 "1st vec" and "3rd vec"
5424 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5425 "2nd vec" and "1st vec"
5426 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5427 "3rd vec" and "2nd vec"
5428 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5429 | New vectors |
5431 So that now new vectors are:
5433 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5434 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5435 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5437 3rd step:
5438 shift right by 5 the concatenation of:
5439 "1st vec" and "1st vec"
5440 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5441 shift right by 3 the concatenation of:
5442 "2nd vec" and "2nd vec"
5443 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5444 | New vectors |
5446 So that now all vectors are READY:
5447 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5448 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5449 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5451 This algorithm is faster than the one in vect_permute_load_chain if:
5452 1. "shift of a concatenation" is faster than general permutation.
5453 This is usually so.
5454 2. The TARGET machine can't execute vector instructions in parallel.
5455 This is because each step of the algorithm depends on the previous one.
5456 The algorithm in vect_permute_load_chain is much more parallel.
5458 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5461 static bool
5462 vect_shift_permute_load_chain (vec<tree> dr_chain,
5463 unsigned int length,
5464 gimple *stmt,
5465 gimple_stmt_iterator *gsi,
5466 vec<tree> *result_chain)
5468 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5469 tree perm2_mask1, perm2_mask2, perm3_mask;
5470 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5471 gimple *perm_stmt;
5473 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5474 unsigned int i;
5475 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5476 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5477 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5478 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5480 result_chain->quick_grow (length);
5481 memcpy (result_chain->address (), dr_chain.address (),
5482 length * sizeof (tree));
5484 if (pow2p_hwi (length) && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5486 unsigned int j, log_length = exact_log2 (length);
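/* perm2_mask1: even-numbered elements in the low half, odd-numbered
   elements in the high half.  For vector length 8 it is
   {0 2 4 6 1 3 5 7}.  */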
5487 for (i = 0; i < nelt / 2; ++i)
5488 sel[i] = i * 2;
5489 for (i = 0; i < nelt / 2; ++i)
5490 sel[nelt / 2 + i] = i * 2 + 1;
5491 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5493 if (dump_enabled_p ())
5494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5495 "shuffle of 2 fields structure is not \
5496 supported by target\n");
5497 return false;
5499 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
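/* perm2_mask2: odd-numbered elements in the low half, even-numbered
   elements in the high half.  For vector length 8 it is
   {1 3 5 7 0 2 4 6}.  */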
5501 for (i = 0; i < nelt / 2; ++i)
5502 sel[i] = i * 2 + 1;
5503 for (i = 0; i < nelt / 2; ++i)
5504 sel[nelt / 2 + i] = i * 2;
5505 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5507 if (dump_enabled_p ())
5508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5509 "shuffle of 2 fields structure is not \
5510 supported by target\n");
5511 return false;
5513 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5515 /* Generating permutation constant to shift all elements.
5516 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5517 for (i = 0; i < nelt; i++)
5518 sel[i] = nelt / 2 + i;
5519 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5521 if (dump_enabled_p ())
5522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5523 "shift permutation is not supported by target\n");
5524 return false;
5526 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5528 /* Generating permutation constant to select vector from 2.
5529 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5530 for (i = 0; i < nelt / 2; i++)
5531 sel[i] = i;
5532 for (i = nelt / 2; i < nelt; i++)
5533 sel[i] = nelt + i;
5534 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5536 if (dump_enabled_p ())
5537 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5538 "select is not supported by target\n");
5539 return false;
5541 select_mask = vect_gen_perm_mask_checked (vectype, sel);
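/* For vector length 8 one inner iteration below thus computes, from
   FIRST_VECT f and SECOND_VECT s,
     vect[0] = {f0 f2 f4 f6 f1 f3 f5 f7}
     vect[1] = {s1 s3 s5 s7 s0 s2 s4 s6};
   shift1_mask then extracts the odd elements
   {f1 f3 f5 f7 s1 s3 s5 s7} and select_mask the even elements
   {f0 f2 f4 f6 s0 s2 s4 s6}.  */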
5543 for (i = 0; i < log_length; i++)
5545 for (j = 0; j < length; j += 2)
5547 first_vect = dr_chain[j];
5548 second_vect = dr_chain[j + 1];
5550 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5551 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5552 first_vect, first_vect,
5553 perm2_mask1);
5554 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5555 vect[0] = data_ref;
5557 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5558 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5559 second_vect, second_vect,
5560 perm2_mask2);
5561 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5562 vect[1] = data_ref;
5564 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5565 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5566 vect[0], vect[1], shift1_mask);
5567 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5568 (*result_chain)[j/2 + length/2] = data_ref;
5570 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5571 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5572 vect[0], vect[1], select_mask);
5573 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5574 (*result_chain)[j/2] = data_ref;
5576 memcpy (dr_chain.address (), result_chain->address (),
5577 length * sizeof (tree));
5579 return true;
5581 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5583 unsigned int k = 0, l = 0;
5585 /* Generating permutation constant to get all elements in the right order.
5586 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5587 for (i = 0; i < nelt; i++)
5589 if (3 * k + (l % 3) >= nelt)
5591 k = 0;
5592 l += (3 - (nelt % 3));
5594 sel[i] = 3 * k + (l % 3);
5595 k++;
5597 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5599 if (dump_enabled_p ())
5600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5601 "shuffle of 3 fields structure is not \
5602 supported by target\n");
5603 return false;
5605 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5607 /* Generating permutation constant to shift all elements.
5608 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5609 for (i = 0; i < nelt; i++)
5610 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5611 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5613 if (dump_enabled_p ())
5614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5615 "shift permutation is not supported by target\n");
5616 return false;
5618 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5620 /* Generating permutation constant to shift all elements.
5621 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5622 for (i = 0; i < nelt; i++)
5623 sel[i] = 2 * (nelt / 3) + 1 + i;
5624 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5626 if (dump_enabled_p ())
5627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5628 "shift permutation is not supported by target\n");
5629 return false;
5631 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5633 /* Generating permutation constant to shift all elements.
5634 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5635 for (i = 0; i < nelt; i++)
5636 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5637 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5639 if (dump_enabled_p ())
5640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5641 "shift permutation is not supported by target\n");
5642 return false;
5644 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5646 /* Generating permutation constant to shift all elements.
5647 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5648 for (i = 0; i < nelt; i++)
5649 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5650 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5652 if (dump_enabled_p ())
5653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5654 "shift permutation is not supported by target\n");
5655 return false;
5657 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5659 for (k = 0; k < 3; k++)
5661 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5662 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5663 dr_chain[k], dr_chain[k],
5664 perm3_mask);
5665 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5666 vect[k] = data_ref;
5669 for (k = 0; k < 3; k++)
5671 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5672 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5673 vect[k % 3], vect[(k + 1) % 3],
5674 shift1_mask);
5675 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5676 vect_shift[k] = data_ref;
5679 for (k = 0; k < 3; k++)
5681 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5682 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5683 vect_shift[(4 - k) % 3],
5684 vect_shift[(3 - k) % 3],
5685 shift2_mask);
5686 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5687 vect[k] = data_ref;
5690 (*result_chain)[3 - (nelt % 3)] = vect[2];
5692 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5693 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5694 vect[0], shift3_mask);
5695 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5696 (*result_chain)[nelt % 3] = data_ref;
5698 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5699 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5700 vect[1], shift4_mask);
5701 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5702 (*result_chain)[0] = data_ref;
5703 return true;
5705 return false;
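/* Illustrative sketch only, not part of this file: the net effect of the
   LENGTH == 3 sequence above, modeled on plain arrays for NELT == 8 (the
   numbering used in the big comment before the function).  OUT[v][i]
   receives element 3 * i + v of the concatenated input, i.e. the three
   vectors end up fully de-interleaved.  */

static void
shift_permute_3_model (const int in[3][8], int out[3][8])
{
  const unsigned int nelt = 8;
  for (unsigned int v = 0; v < 3; v++)
    for (unsigned int i = 0; i < nelt; i++)
      {
        /* Index of the wanted element within the whole 3 * NELT group.  */
        unsigned int elt = 3 * i + v;
        out[v][i] = in[elt / nelt][elt % nelt];
      }
}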
5708 /* Function vect_transform_grouped_load.
5710 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5711 to perform their permutation and ascribe the resulting vectorized statements to
5712 the scalar statements.
5715 void
5716 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5717 gimple_stmt_iterator *gsi)
5719 machine_mode mode;
5720 vec<tree> result_chain = vNULL;
5722 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5723 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5724 vectors that are ready for vector computation.  */
5725 result_chain.create (size);
5727 /* If the reassociation width for the vector type is 2 or greater, the target
5728 machine can execute 2 or more vector instructions in parallel.  Otherwise
5729 try to get the chain for the load group using vect_shift_permute_load_chain. */
5730 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5731 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5732 || pow2p_hwi (size)
5733 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5734 gsi, &result_chain))
5735 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5736 vect_record_grouped_load_vectors (stmt, result_chain);
5737 result_chain.release ();
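/* Illustrative sketch only, not part of this file: the shape of a caller
   such as vectorizable_load, reduced to the part that feeds the function
   above.  LOADED_DEFS is assumed to already hold the SSA names produced
   for the SIZE vector loads of the group, in memory order; the wrapper
   name is made up.  */

static void
transform_grouped_load_model (gimple *stmt, tree *loaded_defs, int size,
                              gimple_stmt_iterator *gsi)
{
  vec<tree> dr_chain = vNULL;
  dr_chain.create (size);
  for (int i = 0; i < size; i++)
    dr_chain.quick_push (loaded_defs[i]);

  /* Permute the loaded vectors and attach the results to the scalar
     stmts of the group.  */
  vect_transform_grouped_load (stmt, dr_chain, size, gsi);
  dr_chain.release ();
}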
5740 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5741 generated as part of the vectorization of STMT. Assign the statement
5742 for each vector to the associated scalar statement. */
5744 void
5745 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5747 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5748 gimple *next_stmt, *new_stmt;
5749 unsigned int i, gap_count;
5750 tree tmp_data_ref;
5752 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5753 Since we scan the chain starting from its first node, their order
5754 corresponds to the order of data-refs in RESULT_CHAIN.
5755 next_stmt = first_stmt;
5756 gap_count = 1;
5757 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5759 if (!next_stmt)
5760 break;
5762 /* Skip the gaps.  Loads created for the gaps will be removed by the dead
5763 code elimination pass later.  No need to check for the first stmt in
5764 the group, since it always exists.
5765 GROUP_GAP is the number of steps in elements from the previous
5766 access (if there is no gap GROUP_GAP is 1). We skip loads that
5767 correspond to the gaps. */
5768 if (next_stmt != first_stmt
5769 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5771 gap_count++;
5772 continue;
5775 while (next_stmt)
5777 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5778 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5779 copies, and we put the new vector statement in the first available
5780 RELATED_STMT. */
5781 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5782 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5783 else
5785 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5787 gimple *prev_stmt =
5788 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5789 gimple *rel_stmt =
5790 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5791 while (rel_stmt)
5793 prev_stmt = rel_stmt;
5794 rel_stmt =
5795 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5798 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5799 new_stmt;
5803 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5804 gap_count = 1;
5805 /* If NEXT_STMT accesses the same DR as the previous statement,
5806 put the same TMP_DATA_REF as its vectorized statement; otherwise
5807 get the next data-ref from RESULT_CHAIN. */
5808 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5809 break;
5814 /* Function vect_can_force_dr_alignment_p.
5816 Returns whether the alignment of a DECL can be forced to be aligned
5817 on an ALIGNMENT-bit boundary.  */
5819 bool
5820 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5822 if (!VAR_P (decl))
5823 return false;
5825 if (decl_in_symtab_p (decl)
5826 && !symtab_node::get (decl)->can_increase_alignment_p ())
5827 return false;
5829 if (TREE_STATIC (decl))
5830 return (alignment <= MAX_OFILE_ALIGNMENT);
5831 else
5832 return (alignment <= MAX_STACK_ALIGNMENT);
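/* Illustrative sketch only, not part of this file: how a caller might use
   the predicate above before raising the alignment of a variable.  The
   wrapper name is made up; SET_DECL_ALIGN and DECL_USER_ALIGN are the
   usual macros for updating a DECL's alignment, and symtab-visible decls
   would normally go through symtab_node::increase_alignment instead.  */

static bool
try_force_decl_alignment (tree decl, unsigned int align_in_bits)
{
  if (!vect_can_force_dr_alignment_p (decl, align_in_bits))
    return false;

  if (DECL_ALIGN (decl) < align_in_bits)
    {
      SET_DECL_ALIGN (decl, align_in_bits);
      /* Mark the alignment as explicitly requested so that later passes
         do not reduce it again.  */
      DECL_USER_ALIGN (decl) = 1;
    }
  return true;
}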
5836 /* Return whether the data reference DR is supported with respect to its
5837 alignment.
5838 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5839 if it is aligned, i.e., check if it is possible to vectorize it with different
5840 alignment. */
5842 enum dr_alignment_support
5843 vect_supportable_dr_alignment (struct data_reference *dr,
5844 bool check_aligned_accesses)
5846 gimple *stmt = DR_STMT (dr);
5847 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5848 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5849 machine_mode mode = TYPE_MODE (vectype);
5850 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5851 struct loop *vect_loop = NULL;
5852 bool nested_in_vect_loop = false;
5854 if (aligned_access_p (dr) && !check_aligned_accesses)
5855 return dr_aligned;
5857 /* For now assume all conditional loads/stores support unaligned
5858 access without any special code. */
5859 if (is_gimple_call (stmt)
5860 && gimple_call_internal_p (stmt)
5861 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5862 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5863 return dr_unaligned_supported;
5865 if (loop_vinfo)
5867 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5868 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5871 /* Possibly unaligned access. */
5873 /* We can choose between using the implicit realignment scheme (generating
5874 a misaligned_move stmt) and the explicit realignment scheme (generating
5875 aligned loads with a REALIGN_LOAD). There are two variants to the
5876 explicit realignment scheme: optimized, and unoptimized.
5877 We can optimize the realignment only if the step between consecutive
5878 vector loads is equal to the vector size. Since the vector memory
5879 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5880 is guaranteed that the misalignment amount remains the same throughout the
5881 execution of the vectorized loop. Therefore, we can create the
5882 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5883 at the loop preheader.
5885 However, in the case of outer-loop vectorization, when vectorizing a
5886 memory access in the inner-loop nested within the LOOP that is now being
5887 vectorized, while it is guaranteed that the misalignment of the
5888 vectorized memory access will remain the same in different outer-loop
5889 iterations, it is *not* guaranteed that it will remain the same throughout
5890 the execution of the inner-loop. This is because the inner-loop advances
5891 with the original scalar step (and not in steps of VS). If the inner-loop
5892 step happens to be a multiple of VS, then the misalignment remains fixed
5893 and we can use the optimized realignment scheme. For example:
5895 for (i=0; i<N; i++)
5896 for (j=0; j<M; j++)
5897 s += a[i+j];
5899 When vectorizing the i-loop in the above example, the step between
5900 consecutive vector loads is 1, and so the misalignment does not remain
5901 fixed across the execution of the inner-loop, and the realignment cannot
5902 be optimized (as illustrated in the following pseudo vectorized loop):
5904 for (i=0; i<N; i+=4)
5905 for (j=0; j<M; j++){
5906 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5907 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5908 // (assuming that we start from an aligned address).
5911 We therefore have to use the unoptimized realignment scheme:
5913 for (i=0; i<N; i+=4)
5914 for (j=k; j<M; j+=4)
5915 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5916 // that the misalignment of the initial address is
5917 // 0).
5919 The loop can then be vectorized as follows:
5921 for (k=0; k<4; k++){
5922 rt = get_realignment_token (&vp[k]);
5923 for (i=0; i<N; i+=4){
5924 v1 = vp[i+k];
5925 for (j=k; j<M; j+=4){
5926 v2 = vp[i+j+VS-1];
5927 va = REALIGN_LOAD <v1,v2,rt>;
5928 vs += va;
5929 v1 = v2;
5932 } */
5934 if (DR_IS_READ (dr))
5936 bool is_packed = false;
5937 tree type = (TREE_TYPE (DR_REF (dr)));
5939 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5940 && (!targetm.vectorize.builtin_mask_for_load
5941 || targetm.vectorize.builtin_mask_for_load ()))
5943 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5945 /* If we are doing SLP then the accesses need not have the
5946 same alignment; instead it depends on the SLP group size.  */
5947 if (loop_vinfo
5948 && STMT_SLP_TYPE (stmt_info)
5949 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5950 * GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)))
5951 % TYPE_VECTOR_SUBPARTS (vectype) != 0))
5953 else if (!loop_vinfo
5954 || (nested_in_vect_loop
5955 && (TREE_INT_CST_LOW (DR_STEP (dr))
5956 != GET_MODE_SIZE (TYPE_MODE (vectype)))))
5957 return dr_explicit_realign;
5958 else
5959 return dr_explicit_realign_optimized;
5961 if (!known_alignment_for_access_p (dr))
5962 is_packed = not_size_aligned (DR_REF (dr));
5964 if (targetm.vectorize.support_vector_misalignment
5965 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5966 /* Can't software pipeline the loads, but can at least do them. */
5967 return dr_unaligned_supported;
5969 else
5971 bool is_packed = false;
5972 tree type = (TREE_TYPE (DR_REF (dr)));
5974 if (!known_alignment_for_access_p (dr))
5975 is_packed = not_size_aligned (DR_REF (dr));
5977 if (targetm.vectorize.support_vector_misalignment
5978 (mode, type, DR_MISALIGNMENT (dr), is_packed))
5979 return dr_unaligned_supported;
5982 /* Unsupported. */
5983 return dr_unaligned_unsupported;
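/* Illustrative sketch only, not part of this file: the kind of dispatch a
   caller performs on the value returned by vect_supportable_dr_alignment.
   The function name and the bool result are made up for illustration.  */

static bool
alignment_scheme_is_usable (struct data_reference *dr)
{
  switch (vect_supportable_dr_alignment (dr, false))
    {
    case dr_aligned:
    case dr_unaligned_supported:
      /* A plain (possibly misaligned) vector access can be emitted.  */
      return true;

    case dr_explicit_realign:
    case dr_explicit_realign_optimized:
      /* The access must go through the REALIGN_LOAD scheme described
         in the comment above.  */
      return true;

    case dr_unaligned_unsupported:
    default:
      return false;
    }
}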