* include/bits/ptr_traits.h (__ptrtr_elt_type, __ptrtr_diff_type,
[official-gcc.git] / gcc / tree-vect-data-refs.c
blob4f4cf4ead5bf4a0a4ab579f08c94dd02d398ccc2
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "alias.h"
28 #include "symtab.h"
29 #include "tree.h"
30 #include "fold-const.h"
31 #include "stor-layout.h"
32 #include "tm_p.h"
33 #include "target.h"
34 #include "predict.h"
35 #include "hard-reg-set.h"
36 #include "function.h"
37 #include "dominance.h"
38 #include "cfg.h"
39 #include "basic-block.h"
40 #include "gimple-pretty-print.h"
41 #include "tree-ssa-alias.h"
42 #include "internal-fn.h"
43 #include "tree-eh.h"
44 #include "gimple-expr.h"
45 #include "gimple.h"
46 #include "gimplify.h"
47 #include "gimple-iterator.h"
48 #include "gimplify-me.h"
49 #include "gimple-ssa.h"
50 #include "tree-phinodes.h"
51 #include "ssa-iterators.h"
52 #include "stringpool.h"
53 #include "tree-ssanames.h"
54 #include "tree-ssa-loop-ivopts.h"
55 #include "tree-ssa-loop-manip.h"
56 #include "tree-ssa-loop.h"
57 #include "cfgloop.h"
58 #include "tree-chrec.h"
59 #include "tree-scalar-evolution.h"
60 #include "tree-vectorizer.h"
61 #include "diagnostic-core.h"
62 #include "cgraph.h"
63 /* Need to include rtl.h, expr.h, etc. for optabs. */
64 #include "rtl.h"
65 #include "flags.h"
66 #include "insn-config.h"
67 #include "expmed.h"
68 #include "dojump.h"
69 #include "explow.h"
70 #include "calls.h"
71 #include "emit-rtl.h"
72 #include "varasm.h"
73 #include "stmt.h"
74 #include "expr.h"
75 #include "insn-codes.h"
76 #include "optabs.h"
77 #include "builtins.h"
79 /* Return true if load- or store-lanes optab OPTAB is implemented for
80 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
82 static bool
83 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
84 tree vectype, unsigned HOST_WIDE_INT count)
86 machine_mode mode, array_mode;
87 bool limit_p;
89 mode = TYPE_MODE (vectype);
90 limit_p = !targetm.array_mode_supported_p (mode, count);
91 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
92 MODE_INT, limit_p);
94 if (array_mode == BLKmode)
96 if (dump_enabled_p ())
97 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
98 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
99 GET_MODE_NAME (mode), count);
100 return false;
103 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
105 if (dump_enabled_p ())
106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
107 "cannot use %s<%s><%s>\n", name,
108 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
109 return false;
112 if (dump_enabled_p ())
113 dump_printf_loc (MSG_NOTE, vect_location,
114 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
115 GET_MODE_NAME (mode));
117 return true;
121 /* Return the smallest scalar part of STMT.
122 This is used to determine the vectype of the stmt. We generally set the
123 vectype according to the type of the result (lhs). For stmts whose
124 result-type is different than the type of the arguments (e.g., demotion,
125 promotion), vectype will be reset appropriately (later). Note that we have
126 to visit the smallest datatype in this function, because that determines the
127 VF. If the smallest datatype in the loop is present only as the rhs of a
128 promotion operation - we'd miss it.
129 Such a case, where a variable of this datatype does not appear in the lhs
130 anywhere in the loop, can only occur if it's an invariant: e.g.:
131 'int_x = (int) short_inv', which we'd expect to have been optimized away by
132 invariant motion. However, we cannot rely on invariant motion to always
133 take invariants out of the loop, and so in the case of promotion we also
134 have to check the rhs.
135 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
136 types. */
138 tree
139 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
140 HOST_WIDE_INT *rhs_size_unit)
142 tree scalar_type = gimple_expr_type (stmt);
143 HOST_WIDE_INT lhs, rhs;
145 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
147 if (is_gimple_assign (stmt)
148 && (gimple_assign_cast_p (stmt)
149 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
150 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
151 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
153 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
155 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
156 if (rhs < lhs)
157 scalar_type = rhs_type;
160 *lhs_size_unit = lhs;
161 *rhs_size_unit = rhs;
162 return scalar_type;
166 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
167 tested at run-time. Return TRUE if DDR was successfully inserted.
168 Return false if versioning is not supported. */
170 static bool
171 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
173 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
175 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
176 return false;
178 if (dump_enabled_p ())
180 dump_printf_loc (MSG_NOTE, vect_location,
181 "mark for run-time aliasing test between ");
182 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
183 dump_printf (MSG_NOTE, " and ");
184 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
185 dump_printf (MSG_NOTE, "\n");
188 if (optimize_loop_nest_for_size_p (loop))
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
192 "versioning not supported when optimizing"
193 " for size.\n");
194 return false;
197 /* FORNOW: We don't support versioning with outer-loop vectorization. */
198 if (loop->inner)
200 if (dump_enabled_p ())
201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
202 "versioning not yet supported for outer-loops.\n");
203 return false;
206 /* FORNOW: We don't support creating runtime alias tests for non-constant
207 step. */
208 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
209 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
211 if (dump_enabled_p ())
212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
213 "versioning not yet supported for non-constant "
214 "step\n");
215 return false;
218 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
219 return true;
223 /* Function vect_analyze_data_ref_dependence.
225 Return TRUE if there (might) exist a dependence between a memory-reference
226 DRA and a memory-reference DRB. When versioning for alias may check a
227 dependence at run-time, return FALSE. Adjust *MAX_VF according to
228 the data dependence. */
230 static bool
231 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo, int *max_vf)
234 unsigned int i;
235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
236 struct data_reference *dra = DDR_A (ddr);
237 struct data_reference *drb = DDR_B (ddr);
238 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
239 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
240 lambda_vector dist_v;
241 unsigned int loop_depth;
243 /* In loop analysis all data references should be vectorizable. */
244 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
245 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
246 gcc_unreachable ();
248 /* Independent data accesses. */
249 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
250 return false;
252 if (dra == drb
253 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
254 return false;
256 /* Even if we have an anti-dependence then, as the vectorized loop covers at
257 least two scalar iterations, there is always also a true dependence.
258 As the vectorizer does not re-order loads and stores we can ignore
259 the anti-dependence if TBAA can disambiguate both DRs similar to the
260 case with known negative distance anti-dependences (positive
261 distance anti-dependences would violate TBAA constraints). */
262 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
263 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
264 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
265 get_alias_set (DR_REF (drb))))
266 return false;
268 /* Unknown data dependence. */
269 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
271 /* If user asserted safelen consecutive iterations can be
272 executed concurrently, assume independence. */
273 if (loop->safelen >= 2)
275 if (loop->safelen < *max_vf)
276 *max_vf = loop->safelen;
277 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
278 return false;
281 if (STMT_VINFO_GATHER_P (stmtinfo_a)
282 || STMT_VINFO_GATHER_P (stmtinfo_b))
284 if (dump_enabled_p ())
286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
287 "versioning for alias not supported for: "
288 "can't determine dependence between ");
289 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
290 DR_REF (dra));
291 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
292 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
293 DR_REF (drb));
294 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
296 return true;
299 if (dump_enabled_p ())
301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
302 "versioning for alias required: "
303 "can't determine dependence between ");
304 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
305 DR_REF (dra));
306 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
307 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
308 DR_REF (drb));
309 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
312 /* Add to list of ddrs that need to be tested at run-time. */
313 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
316 /* Known data dependence. */
317 if (DDR_NUM_DIST_VECTS (ddr) == 0)
319 /* If user asserted safelen consecutive iterations can be
320 executed concurrently, assume independence. */
321 if (loop->safelen >= 2)
323 if (loop->safelen < *max_vf)
324 *max_vf = loop->safelen;
325 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
326 return false;
329 if (STMT_VINFO_GATHER_P (stmtinfo_a)
330 || STMT_VINFO_GATHER_P (stmtinfo_b))
332 if (dump_enabled_p ())
334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
335 "versioning for alias not supported for: "
336 "bad dist vector for ");
337 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
338 DR_REF (dra));
339 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
340 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
341 DR_REF (drb));
342 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344 return true;
347 if (dump_enabled_p ())
349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
350 "versioning for alias required: "
351 "bad dist vector for ");
352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
353 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
354 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
355 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
357 /* Add to list of ddrs that need to be tested at run-time. */
358 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
361 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
362 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
364 int dist = dist_v[loop_depth];
366 if (dump_enabled_p ())
367 dump_printf_loc (MSG_NOTE, vect_location,
368 "dependence distance = %d.\n", dist);
370 if (dist == 0)
372 if (dump_enabled_p ())
374 dump_printf_loc (MSG_NOTE, vect_location,
375 "dependence distance == 0 between ");
376 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
377 dump_printf (MSG_NOTE, " and ");
378 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
379 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
382 /* When we perform grouped accesses and perform implicit CSE
383 by detecting equal accesses and doing disambiguation with
384 runtime alias tests like for
385 .. = a[i];
386 .. = a[i+1];
387 a[i] = ..;
388 a[i+1] = ..;
389 *p = ..;
390 .. = a[i];
391 .. = a[i+1];
392 where we will end up loading { a[i], a[i+1] } once, make
393 sure that inserting group loads before the first load and
394 stores after the last store will do the right thing.
395 Similar for groups like
396 a[i] = ...;
397 ... = a[i];
398 a[i+1] = ...;
399 where loads from the group interleave with the store. */
400 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
401 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
403 gimple earlier_stmt;
404 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
405 if (DR_IS_WRITE
406 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
408 if (dump_enabled_p ())
409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
410 "READ_WRITE dependence in interleaving."
411 "\n");
412 return true;
416 continue;
419 if (dist > 0 && DDR_REVERSED_P (ddr))
421 /* If DDR_REVERSED_P the order of the data-refs in DDR was
422 reversed (to make distance vector positive), and the actual
423 distance is negative. */
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
426 "dependence distance negative.\n");
427 /* Record a negative dependence distance to later limit the
428 amount of stmt copying / unrolling we can perform.
429 Only need to handle read-after-write dependence. */
430 if (DR_IS_READ (drb)
431 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
432 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
433 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
434 continue;
437 if (abs (dist) >= 2
438 && abs (dist) < *max_vf)
440 /* The dependence distance requires reduction of the maximal
441 vectorization factor. */
442 *max_vf = abs (dist);
443 if (dump_enabled_p ())
444 dump_printf_loc (MSG_NOTE, vect_location,
445 "adjusting maximal vectorization factor to %i\n",
446 *max_vf);
449 if (abs (dist) >= *max_vf)
451 /* Dependence distance does not create dependence, as far as
452 vectorization is concerned, in this case. */
453 if (dump_enabled_p ())
454 dump_printf_loc (MSG_NOTE, vect_location,
455 "dependence distance >= VF.\n");
456 continue;
459 if (dump_enabled_p ())
461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
462 "not vectorized, possible dependence "
463 "between data-refs ");
464 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
465 dump_printf (MSG_NOTE, " and ");
466 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
467 dump_printf (MSG_NOTE, "\n");
470 return true;
473 return false;
476 /* Function vect_analyze_data_ref_dependences.
478 Examine all the data references in the loop, and make sure there do not
479 exist any data dependences between them. Set *MAX_VF according to
480 the maximum vectorization factor the data dependences allow. */
482 bool
483 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
485 unsigned int i;
486 struct data_dependence_relation *ddr;
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_NOTE, vect_location,
490 "=== vect_analyze_data_ref_dependences ===\n");
492 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
493 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
494 &LOOP_VINFO_DDRS (loop_vinfo),
495 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
496 return false;
498 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
499 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
500 return false;
502 return true;
506 /* Function vect_slp_analyze_data_ref_dependence.
508 Return TRUE if there (might) exist a dependence between a memory-reference
509 DRA and a memory-reference DRB. When versioning for alias may check a
510 dependence at run-time, return FALSE. Adjust *MAX_VF according to
511 the data dependence. */
513 static bool
514 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
516 struct data_reference *dra = DDR_A (ddr);
517 struct data_reference *drb = DDR_B (ddr);
519 /* We need to check dependences of statements marked as unvectorizable
520 as well, they still can prohibit vectorization. */
522 /* Independent data accesses. */
523 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
524 return false;
526 if (dra == drb)
527 return false;
529 /* Read-read is OK. */
530 if (DR_IS_READ (dra) && DR_IS_READ (drb))
531 return false;
533 /* If dra and drb are part of the same interleaving chain consider
534 them independent. */
535 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
536 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
537 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
538 return false;
540 /* Unknown data dependence. */
541 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
543 if (dump_enabled_p ())
545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
546 "can't determine dependence between ");
547 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
548 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
549 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
550 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
553 else if (dump_enabled_p ())
555 dump_printf_loc (MSG_NOTE, vect_location,
556 "determined dependence between ");
557 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
558 dump_printf (MSG_NOTE, " and ");
559 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
560 dump_printf (MSG_NOTE, "\n");
563 /* We do not vectorize basic blocks with write-write dependencies. */
564 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
565 return true;
567 /* If we have a read-write dependence check that the load is before the store.
568 When we vectorize basic blocks, vector load can be only before
569 corresponding scalar load, and vector store can be only after its
570 corresponding scalar store. So the order of the acceses is preserved in
571 case the load is before the store. */
572 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
573 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
575 /* That only holds for load-store pairs taking part in vectorization. */
576 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
577 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
578 return false;
581 return true;
585 /* Function vect_analyze_data_ref_dependences.
587 Examine all the data references in the basic-block, and make sure there
588 do not exist any data dependences between them. Set *MAX_VF according to
589 the maximum vectorization factor the data dependences allow. */
591 bool
592 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
594 struct data_dependence_relation *ddr;
595 unsigned int i;
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "=== vect_slp_analyze_data_ref_dependences ===\n");
601 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
602 &BB_VINFO_DDRS (bb_vinfo),
603 vNULL, true))
604 return false;
606 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
607 if (vect_slp_analyze_data_ref_dependence (ddr))
608 return false;
610 return true;
614 /* Function vect_compute_data_ref_alignment
616 Compute the misalignment of the data reference DR.
618 Output:
619 1. If during the misalignment computation it is found that the data reference
620 cannot be vectorized then false is returned.
621 2. DR_MISALIGNMENT (DR) is defined.
623 FOR NOW: No analysis is actually performed. Misalignment is calculated
624 only for trivial cases. TODO. */
626 static bool
627 vect_compute_data_ref_alignment (struct data_reference *dr)
629 gimple stmt = DR_STMT (dr);
630 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
631 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
632 struct loop *loop = NULL;
633 tree ref = DR_REF (dr);
634 tree vectype;
635 tree base, base_addr;
636 bool base_aligned;
637 tree misalign = NULL_TREE;
638 tree aligned_to;
639 unsigned HOST_WIDE_INT alignment;
641 if (dump_enabled_p ())
642 dump_printf_loc (MSG_NOTE, vect_location,
643 "vect_compute_data_ref_alignment:\n");
645 if (loop_vinfo)
646 loop = LOOP_VINFO_LOOP (loop_vinfo);
648 /* Initialize misalignment to unknown. */
649 SET_DR_MISALIGNMENT (dr, -1);
651 /* Strided accesses perform only component accesses, misalignment information
652 is irrelevant for them. */
653 if (STMT_VINFO_STRIDED_P (stmt_info)
654 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
655 return true;
657 if (tree_fits_shwi_p (DR_STEP (dr)))
658 misalign = DR_INIT (dr);
659 aligned_to = DR_ALIGNED_TO (dr);
660 base_addr = DR_BASE_ADDRESS (dr);
661 vectype = STMT_VINFO_VECTYPE (stmt_info);
663 /* In case the dataref is in an inner-loop of the loop that is being
664 vectorized (LOOP), we use the base and misalignment information
665 relative to the outer-loop (LOOP). This is ok only if the misalignment
666 stays the same throughout the execution of the inner-loop, which is why
667 we have to check that the stride of the dataref in the inner-loop evenly
668 divides by the vector size. */
669 if (loop && nested_in_vect_loop_p (loop, stmt))
671 tree step = DR_STEP (dr);
673 if (tree_fits_shwi_p (step)
674 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
676 if (dump_enabled_p ())
677 dump_printf_loc (MSG_NOTE, vect_location,
678 "inner step divides the vector-size.\n");
679 misalign = STMT_VINFO_DR_INIT (stmt_info);
680 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
681 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
683 else
685 if (dump_enabled_p ())
686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
687 "inner step doesn't divide the vector-size.\n");
688 misalign = NULL_TREE;
692 /* Similarly we can only use base and misalignment information relative to
693 an innermost loop if the misalignment stays the same throughout the
694 execution of the loop. As above, this is the case if the stride of
695 the dataref evenly divides by the vector size. */
696 else
698 tree step = DR_STEP (dr);
699 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
701 if (tree_fits_shwi_p (step)
702 && ((tree_to_shwi (step) * vf)
703 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
707 "step doesn't divide the vector-size.\n");
708 misalign = NULL_TREE;
712 alignment = TYPE_ALIGN_UNIT (vectype);
714 if ((compare_tree_int (aligned_to, alignment) < 0)
715 || !misalign)
717 if (dump_enabled_p ())
719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
720 "Unknown alignment for access: ");
721 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
722 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
724 return true;
727 /* To look at alignment of the base we have to preserve an inner MEM_REF
728 as that carries alignment information of the actual access. */
729 base = ref;
730 while (handled_component_p (base))
731 base = TREE_OPERAND (base, 0);
732 if (TREE_CODE (base) == MEM_REF)
733 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
734 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
736 if (get_object_alignment (base) >= TYPE_ALIGN (vectype))
737 base_aligned = true;
738 else
739 base_aligned = false;
741 if (!base_aligned)
743 /* Strip an inner MEM_REF to a bare decl if possible. */
744 if (TREE_CODE (base) == MEM_REF
745 && integer_zerop (TREE_OPERAND (base, 1))
746 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
747 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
749 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
751 if (dump_enabled_p ())
753 dump_printf_loc (MSG_NOTE, vect_location,
754 "can't force alignment of ref: ");
755 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
756 dump_printf (MSG_NOTE, "\n");
758 return true;
761 /* Force the alignment of the decl.
762 NOTE: This is the only change to the code we make during
763 the analysis phase, before deciding to vectorize the loop. */
764 if (dump_enabled_p ())
766 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
767 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
768 dump_printf (MSG_NOTE, "\n");
771 ((dataref_aux *)dr->aux)->base_decl = base;
772 ((dataref_aux *)dr->aux)->base_misaligned = true;
775 /* If this is a backward running DR then first access in the larger
776 vectype actually is N-1 elements before the address in the DR.
777 Adjust misalign accordingly. */
778 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
780 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
781 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
782 otherwise we wouldn't be here. */
783 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
784 /* PLUS because DR_STEP was negative. */
785 misalign = size_binop (PLUS_EXPR, misalign, offset);
788 SET_DR_MISALIGNMENT (dr,
789 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
791 if (dump_enabled_p ())
793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
794 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
795 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
796 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
799 return true;
803 /* Function vect_compute_data_refs_alignment
805 Compute the misalignment of data references in the loop.
806 Return FALSE if a data reference is found that cannot be vectorized. */
808 static bool
809 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
810 bb_vec_info bb_vinfo)
812 vec<data_reference_p> datarefs;
813 struct data_reference *dr;
814 unsigned int i;
816 if (loop_vinfo)
817 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
818 else
819 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
821 FOR_EACH_VEC_ELT (datarefs, i, dr)
822 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
823 && !vect_compute_data_ref_alignment (dr))
825 if (bb_vinfo)
827 /* Mark unsupported statement as unvectorizable. */
828 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
829 continue;
831 else
832 return false;
835 return true;
839 /* Function vect_update_misalignment_for_peel
841 DR - the data reference whose misalignment is to be adjusted.
842 DR_PEEL - the data reference whose misalignment is being made
843 zero in the vector loop by the peel.
844 NPEEL - the number of iterations in the peel loop if the misalignment
845 of DR_PEEL is known at compile time. */
847 static void
848 vect_update_misalignment_for_peel (struct data_reference *dr,
849 struct data_reference *dr_peel, int npeel)
851 unsigned int i;
852 vec<dr_p> same_align_drs;
853 struct data_reference *current_dr;
854 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
855 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
856 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
857 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
859 /* For interleaved data accesses the step in the loop must be multiplied by
860 the size of the interleaving group. */
861 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
862 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
863 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
864 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
866 /* It can be assumed that the data refs with the same alignment as dr_peel
867 are aligned in the vector loop. */
868 same_align_drs
869 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
870 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
872 if (current_dr != dr)
873 continue;
874 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
875 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
876 SET_DR_MISALIGNMENT (dr, 0);
877 return;
880 if (known_alignment_for_access_p (dr)
881 && known_alignment_for_access_p (dr_peel))
883 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
884 int misal = DR_MISALIGNMENT (dr);
885 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
886 misal += negative ? -npeel * dr_size : npeel * dr_size;
887 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
888 SET_DR_MISALIGNMENT (dr, misal);
889 return;
892 if (dump_enabled_p ())
893 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
894 SET_DR_MISALIGNMENT (dr, -1);
898 /* Function vect_verify_datarefs_alignment
900 Return TRUE if all data references in the loop can be
901 handled with respect to alignment. */
903 bool
904 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
906 vec<data_reference_p> datarefs;
907 struct data_reference *dr;
908 enum dr_alignment_support supportable_dr_alignment;
909 unsigned int i;
911 if (loop_vinfo)
912 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
913 else
914 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
916 FOR_EACH_VEC_ELT (datarefs, i, dr)
918 gimple stmt = DR_STMT (dr);
919 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
921 if (!STMT_VINFO_RELEVANT_P (stmt_info))
922 continue;
924 /* For interleaving, only the alignment of the first access matters.
925 Skip statements marked as not vectorizable. */
926 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
927 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
928 || !STMT_VINFO_VECTORIZABLE (stmt_info))
929 continue;
931 /* Strided accesses perform only component accesses, alignment is
932 irrelevant for them. */
933 if (STMT_VINFO_STRIDED_P (stmt_info)
934 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
935 continue;
937 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
938 if (!supportable_dr_alignment)
940 if (dump_enabled_p ())
942 if (DR_IS_READ (dr))
943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
944 "not vectorized: unsupported unaligned load.");
945 else
946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
947 "not vectorized: unsupported unaligned "
948 "store.");
950 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
951 DR_REF (dr));
952 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
954 return false;
956 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
957 dump_printf_loc (MSG_NOTE, vect_location,
958 "Vectorizing an unaligned access.\n");
960 return true;
963 /* Given an memory reference EXP return whether its alignment is less
964 than its size. */
966 static bool
967 not_size_aligned (tree exp)
969 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
970 return true;
972 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
973 > get_object_alignment (exp));
976 /* Function vector_alignment_reachable_p
978 Return true if vector alignment for DR is reachable by peeling
979 a few loop iterations. Return false otherwise. */
981 static bool
982 vector_alignment_reachable_p (struct data_reference *dr)
984 gimple stmt = DR_STMT (dr);
985 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
986 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
988 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
990 /* For interleaved access we peel only if number of iterations in
991 the prolog loop ({VF - misalignment}), is a multiple of the
992 number of the interleaved accesses. */
993 int elem_size, mis_in_elements;
994 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
996 /* FORNOW: handle only known alignment. */
997 if (!known_alignment_for_access_p (dr))
998 return false;
1000 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1001 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1003 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1004 return false;
1007 /* If misalignment is known at the compile time then allow peeling
1008 only if natural alignment is reachable through peeling. */
1009 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1011 HOST_WIDE_INT elmsize =
1012 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1013 if (dump_enabled_p ())
1015 dump_printf_loc (MSG_NOTE, vect_location,
1016 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1017 dump_printf (MSG_NOTE,
1018 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1020 if (DR_MISALIGNMENT (dr) % elmsize)
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1024 "data size does not divide the misalignment.\n");
1025 return false;
1029 if (!known_alignment_for_access_p (dr))
1031 tree type = TREE_TYPE (DR_REF (dr));
1032 bool is_packed = not_size_aligned (DR_REF (dr));
1033 if (dump_enabled_p ())
1034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1035 "Unknown misalignment, is_packed = %d\n",is_packed);
1036 if ((TYPE_USER_ALIGN (type) && !is_packed)
1037 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1038 return true;
1039 else
1040 return false;
1043 return true;
1047 /* Calculate the cost of the memory access represented by DR. */
1049 static void
1050 vect_get_data_access_cost (struct data_reference *dr,
1051 unsigned int *inside_cost,
1052 unsigned int *outside_cost,
1053 stmt_vector_for_cost *body_cost_vec)
1055 gimple stmt = DR_STMT (dr);
1056 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1057 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1058 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1059 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1060 int ncopies = vf / nunits;
1062 if (DR_IS_READ (dr))
1063 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1064 NULL, body_cost_vec, false);
1065 else
1066 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1068 if (dump_enabled_p ())
1069 dump_printf_loc (MSG_NOTE, vect_location,
1070 "vect_get_data_access_cost: inside_cost = %d, "
1071 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1075 /* Insert DR into peeling hash table with NPEEL as key. */
1077 static void
1078 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1079 int npeel)
1081 struct _vect_peel_info elem, *slot;
1082 _vect_peel_info **new_slot;
1083 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1085 elem.npeel = npeel;
1086 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1087 if (slot)
1088 slot->count++;
1089 else
1091 slot = XNEW (struct _vect_peel_info);
1092 slot->npeel = npeel;
1093 slot->dr = dr;
1094 slot->count = 1;
1095 new_slot
1096 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1097 *new_slot = slot;
1100 if (!supportable_dr_alignment
1101 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1102 slot->count += VECT_MAX_COST;
1106 /* Traverse peeling hash table to find peeling option that aligns maximum
1107 number of data accesses. */
1110 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1111 _vect_peel_extended_info *max)
1113 vect_peel_info elem = *slot;
1115 if (elem->count > max->peel_info.count
1116 || (elem->count == max->peel_info.count
1117 && max->peel_info.npeel > elem->npeel))
1119 max->peel_info.npeel = elem->npeel;
1120 max->peel_info.count = elem->count;
1121 max->peel_info.dr = elem->dr;
1124 return 1;
1128 /* Traverse peeling hash table and calculate cost for each peeling option.
1129 Find the one with the lowest cost. */
1132 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1133 _vect_peel_extended_info *min)
1135 vect_peel_info elem = *slot;
1136 int save_misalignment, dummy;
1137 unsigned int inside_cost = 0, outside_cost = 0, i;
1138 gimple stmt = DR_STMT (elem->dr);
1139 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1140 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1141 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1142 struct data_reference *dr;
1143 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1145 prologue_cost_vec.create (2);
1146 body_cost_vec.create (2);
1147 epilogue_cost_vec.create (2);
1149 FOR_EACH_VEC_ELT (datarefs, i, dr)
1151 stmt = DR_STMT (dr);
1152 stmt_info = vinfo_for_stmt (stmt);
1153 /* For interleaving, only the alignment of the first access
1154 matters. */
1155 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1156 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1157 continue;
1159 save_misalignment = DR_MISALIGNMENT (dr);
1160 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1161 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1162 &body_cost_vec);
1163 SET_DR_MISALIGNMENT (dr, save_misalignment);
1166 outside_cost += vect_get_known_peeling_cost
1167 (loop_vinfo, elem->npeel, &dummy,
1168 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1169 &prologue_cost_vec, &epilogue_cost_vec);
1171 /* Prologue and epilogue costs are added to the target model later.
1172 These costs depend only on the scalar iteration cost, the
1173 number of peeling iterations finally chosen, and the number of
1174 misaligned statements. So discard the information found here. */
1175 prologue_cost_vec.release ();
1176 epilogue_cost_vec.release ();
1178 if (inside_cost < min->inside_cost
1179 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1181 min->inside_cost = inside_cost;
1182 min->outside_cost = outside_cost;
1183 min->body_cost_vec.release ();
1184 min->body_cost_vec = body_cost_vec;
1185 min->peel_info.dr = elem->dr;
1186 min->peel_info.npeel = elem->npeel;
1188 else
1189 body_cost_vec.release ();
1191 return 1;
1195 /* Choose best peeling option by traversing peeling hash table and either
1196 choosing an option with the lowest cost (if cost model is enabled) or the
1197 option that aligns as many accesses as possible. */
1199 static struct data_reference *
1200 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1201 unsigned int *npeel,
1202 stmt_vector_for_cost *body_cost_vec)
1204 struct _vect_peel_extended_info res;
1206 res.peel_info.dr = NULL;
1207 res.body_cost_vec = stmt_vector_for_cost ();
1209 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1211 res.inside_cost = INT_MAX;
1212 res.outside_cost = INT_MAX;
1213 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1214 ->traverse <_vect_peel_extended_info *,
1215 vect_peeling_hash_get_lowest_cost> (&res);
1217 else
1219 res.peel_info.count = 0;
1220 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1221 ->traverse <_vect_peel_extended_info *,
1222 vect_peeling_hash_get_most_frequent> (&res);
1225 *npeel = res.peel_info.npeel;
1226 *body_cost_vec = res.body_cost_vec;
1227 return res.peel_info.dr;
1231 /* Function vect_enhance_data_refs_alignment
1233 This pass will use loop versioning and loop peeling in order to enhance
1234 the alignment of data references in the loop.
1236 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1237 original loop is to be vectorized. Any other loops that are created by
1238 the transformations performed in this pass - are not supposed to be
1239 vectorized. This restriction will be relaxed.
1241 This pass will require a cost model to guide it whether to apply peeling
1242 or versioning or a combination of the two. For example, the scheme that
1243 intel uses when given a loop with several memory accesses, is as follows:
1244 choose one memory access ('p') which alignment you want to force by doing
1245 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1246 other accesses are not necessarily aligned, or (2) use loop versioning to
1247 generate one loop in which all accesses are aligned, and another loop in
1248 which only 'p' is necessarily aligned.
1250 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1251 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1252 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1254 Devising a cost model is the most critical aspect of this work. It will
1255 guide us on which access to peel for, whether to use loop versioning, how
1256 many versions to create, etc. The cost model will probably consist of
1257 generic considerations as well as target specific considerations (on
1258 powerpc for example, misaligned stores are more painful than misaligned
1259 loads).
1261 Here are the general steps involved in alignment enhancements:
1263 -- original loop, before alignment analysis:
1264 for (i=0; i<N; i++){
1265 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1266 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1269 -- After vect_compute_data_refs_alignment:
1270 for (i=0; i<N; i++){
1271 x = q[i]; # DR_MISALIGNMENT(q) = 3
1272 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1275 -- Possibility 1: we do loop versioning:
1276 if (p is aligned) {
1277 for (i=0; i<N; i++){ # loop 1A
1278 x = q[i]; # DR_MISALIGNMENT(q) = 3
1279 p[i] = y; # DR_MISALIGNMENT(p) = 0
1282 else {
1283 for (i=0; i<N; i++){ # loop 1B
1284 x = q[i]; # DR_MISALIGNMENT(q) = 3
1285 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1289 -- Possibility 2: we do loop peeling:
1290 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1291 x = q[i];
1292 p[i] = y;
1294 for (i = 3; i < N; i++){ # loop 2A
1295 x = q[i]; # DR_MISALIGNMENT(q) = 0
1296 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1299 -- Possibility 3: combination of loop peeling and versioning:
1300 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1301 x = q[i];
1302 p[i] = y;
1304 if (p is aligned) {
1305 for (i = 3; i<N; i++){ # loop 3A
1306 x = q[i]; # DR_MISALIGNMENT(q) = 0
1307 p[i] = y; # DR_MISALIGNMENT(p) = 0
1310 else {
1311 for (i = 3; i<N; i++){ # loop 3B
1312 x = q[i]; # DR_MISALIGNMENT(q) = 0
1313 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1317 These loops are later passed to loop_transform to be vectorized. The
1318 vectorizer will use the alignment information to guide the transformation
1319 (whether to generate regular loads/stores, or with special handling for
1320 misalignment). */
1322 bool
1323 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1325 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1326 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1327 enum dr_alignment_support supportable_dr_alignment;
1328 struct data_reference *dr0 = NULL, *first_store = NULL;
1329 struct data_reference *dr;
1330 unsigned int i, j;
1331 bool do_peeling = false;
1332 bool do_versioning = false;
1333 bool stat;
1334 gimple stmt;
1335 stmt_vec_info stmt_info;
1336 unsigned int npeel = 0;
1337 bool all_misalignments_unknown = true;
1338 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1339 unsigned possible_npeel_number = 1;
1340 tree vectype;
1341 unsigned int nelements, mis, same_align_drs_max = 0;
1342 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1344 if (dump_enabled_p ())
1345 dump_printf_loc (MSG_NOTE, vect_location,
1346 "=== vect_enhance_data_refs_alignment ===\n");
1348 /* While cost model enhancements are expected in the future, the high level
1349 view of the code at this time is as follows:
1351 A) If there is a misaligned access then see if peeling to align
1352 this access can make all data references satisfy
1353 vect_supportable_dr_alignment. If so, update data structures
1354 as needed and return true.
1356 B) If peeling wasn't possible and there is a data reference with an
1357 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1358 then see if loop versioning checks can be used to make all data
1359 references satisfy vect_supportable_dr_alignment. If so, update
1360 data structures as needed and return true.
1362 C) If neither peeling nor versioning were successful then return false if
1363 any data reference does not satisfy vect_supportable_dr_alignment.
1365 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1367 Note, Possibility 3 above (which is peeling and versioning together) is not
1368 being done at this time. */
1370 /* (1) Peeling to force alignment. */
1372 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1373 Considerations:
1374 + How many accesses will become aligned due to the peeling
1375 - How many accesses will become unaligned due to the peeling,
1376 and the cost of misaligned accesses.
1377 - The cost of peeling (the extra runtime checks, the increase
1378 in code size). */
1380 FOR_EACH_VEC_ELT (datarefs, i, dr)
1382 stmt = DR_STMT (dr);
1383 stmt_info = vinfo_for_stmt (stmt);
1385 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1386 continue;
1388 /* For interleaving, only the alignment of the first access
1389 matters. */
1390 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1391 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1392 continue;
1394 /* For invariant accesses there is nothing to enhance. */
1395 if (integer_zerop (DR_STEP (dr)))
1396 continue;
1398 /* Strided accesses perform only component accesses, alignment is
1399 irrelevant for them. */
1400 if (STMT_VINFO_STRIDED_P (stmt_info)
1401 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1402 continue;
1404 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1405 do_peeling = vector_alignment_reachable_p (dr);
1406 if (do_peeling)
1408 if (known_alignment_for_access_p (dr))
1410 unsigned int npeel_tmp;
1411 bool negative = tree_int_cst_compare (DR_STEP (dr),
1412 size_zero_node) < 0;
1414 /* Save info about DR in the hash table. */
1415 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1416 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1417 = new hash_table<peel_info_hasher> (1);
1419 vectype = STMT_VINFO_VECTYPE (stmt_info);
1420 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1421 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1422 TREE_TYPE (DR_REF (dr))));
1423 npeel_tmp = (negative
1424 ? (mis - nelements) : (nelements - mis))
1425 & (nelements - 1);
1427 /* For multiple types, it is possible that the bigger type access
1428 will have more than one peeling option. E.g., a loop with two
1429 types: one of size (vector size / 4), and the other one of
1430 size (vector size / 8). Vectorization factor will 8. If both
1431 access are misaligned by 3, the first one needs one scalar
1432 iteration to be aligned, and the second one needs 5. But the
1433 the first one will be aligned also by peeling 5 scalar
1434 iterations, and in that case both accesses will be aligned.
1435 Hence, except for the immediate peeling amount, we also want
1436 to try to add full vector size, while we don't exceed
1437 vectorization factor.
1438 We do this automtically for cost model, since we calculate cost
1439 for every peeling option. */
1440 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1442 if (STMT_SLP_TYPE (stmt_info))
1443 possible_npeel_number
1444 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1445 else
1446 possible_npeel_number = vf / nelements;
1449 /* Handle the aligned case. We may decide to align some other
1450 access, making DR unaligned. */
1451 if (DR_MISALIGNMENT (dr) == 0)
1453 npeel_tmp = 0;
1454 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1455 possible_npeel_number++;
1458 for (j = 0; j < possible_npeel_number; j++)
1460 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1461 npeel_tmp += nelements;
1464 all_misalignments_unknown = false;
1465 /* Data-ref that was chosen for the case that all the
1466 misalignments are unknown is not relevant anymore, since we
1467 have a data-ref with known alignment. */
1468 dr0 = NULL;
1470 else
1472 /* If we don't know any misalignment values, we prefer
1473 peeling for data-ref that has the maximum number of data-refs
1474 with the same alignment, unless the target prefers to align
1475 stores over load. */
1476 if (all_misalignments_unknown)
1478 unsigned same_align_drs
1479 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1480 if (!dr0
1481 || same_align_drs_max < same_align_drs)
1483 same_align_drs_max = same_align_drs;
1484 dr0 = dr;
1486 /* For data-refs with the same number of related
1487 accesses prefer the one where the misalign
1488 computation will be invariant in the outermost loop. */
1489 else if (same_align_drs_max == same_align_drs)
1491 struct loop *ivloop0, *ivloop;
1492 ivloop0 = outermost_invariant_loop_for_expr
1493 (loop, DR_BASE_ADDRESS (dr0));
1494 ivloop = outermost_invariant_loop_for_expr
1495 (loop, DR_BASE_ADDRESS (dr));
1496 if ((ivloop && !ivloop0)
1497 || (ivloop && ivloop0
1498 && flow_loop_nested_p (ivloop, ivloop0)))
1499 dr0 = dr;
1502 if (!first_store && DR_IS_WRITE (dr))
1503 first_store = dr;
1506 /* If there are both known and unknown misaligned accesses in the
1507 loop, we choose peeling amount according to the known
1508 accesses. */
1509 if (!supportable_dr_alignment)
1511 dr0 = dr;
1512 if (!first_store && DR_IS_WRITE (dr))
1513 first_store = dr;
1517 else
1519 if (!aligned_access_p (dr))
1521 if (dump_enabled_p ())
1522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1523 "vector alignment may not be reachable\n");
1524 break;
1529 /* Check if we can possibly peel the loop. */
1530 if (!vect_can_advance_ivs_p (loop_vinfo)
1531 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1532 do_peeling = false;
1534 if (do_peeling
1535 && all_misalignments_unknown
1536 && vect_supportable_dr_alignment (dr0, false))
1538 /* Check if the target requires to prefer stores over loads, i.e., if
1539 misaligned stores are more expensive than misaligned loads (taking
1540 drs with same alignment into account). */
1541 if (first_store && DR_IS_READ (dr0))
1543 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1544 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1545 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1546 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1547 stmt_vector_for_cost dummy;
1548 dummy.create (2);
1550 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1551 &dummy);
1552 vect_get_data_access_cost (first_store, &store_inside_cost,
1553 &store_outside_cost, &dummy);
1555 dummy.release ();
1557 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1558 aligning the load DR0). */
1559 load_inside_penalty = store_inside_cost;
1560 load_outside_penalty = store_outside_cost;
1561 for (i = 0;
1562 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1563 DR_STMT (first_store))).iterate (i, &dr);
1564 i++)
1565 if (DR_IS_READ (dr))
1567 load_inside_penalty += load_inside_cost;
1568 load_outside_penalty += load_outside_cost;
1570 else
1572 load_inside_penalty += store_inside_cost;
1573 load_outside_penalty += store_outside_cost;
1576 /* Calculate the penalty for leaving DR0 unaligned (by
1577 aligning the FIRST_STORE). */
1578 store_inside_penalty = load_inside_cost;
1579 store_outside_penalty = load_outside_cost;
1580 for (i = 0;
1581 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1582 DR_STMT (dr0))).iterate (i, &dr);
1583 i++)
1584 if (DR_IS_READ (dr))
1586 store_inside_penalty += load_inside_cost;
1587 store_outside_penalty += load_outside_cost;
1589 else
1591 store_inside_penalty += store_inside_cost;
1592 store_outside_penalty += store_outside_cost;
1595 if (load_inside_penalty > store_inside_penalty
1596 || (load_inside_penalty == store_inside_penalty
1597 && load_outside_penalty > store_outside_penalty))
1598 dr0 = first_store;
1601 /* In case there are only loads with different unknown misalignments, use
1602 peeling only if it may help to align other accesses in the loop or
1603 if it may help improving load bandwith when we'd end up using
1604 unaligned loads. */
1605 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1606 if (!first_store
1607 && !STMT_VINFO_SAME_ALIGN_REFS (
1608 vinfo_for_stmt (DR_STMT (dr0))).length ()
1609 && (vect_supportable_dr_alignment (dr0, false)
1610 != dr_unaligned_supported
1611 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1612 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1613 do_peeling = false;
1616 if (do_peeling && !dr0)
1618 /* Peeling is possible, but there is no data access that is not supported
1619 unless aligned. So we try to choose the best possible peeling. */
1621 /* We should get here only if there are drs with known misalignment. */
1622 gcc_assert (!all_misalignments_unknown);
1624 /* Choose the best peeling from the hash table. */
1625 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1626 &body_cost_vec);
1627 if (!dr0 || !npeel)
1628 do_peeling = false;
1631 if (do_peeling)
1633 stmt = DR_STMT (dr0);
1634 stmt_info = vinfo_for_stmt (stmt);
1635 vectype = STMT_VINFO_VECTYPE (stmt_info);
1636 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1638 if (known_alignment_for_access_p (dr0))
1640 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1641 size_zero_node) < 0;
1642 if (!npeel)
1644 /* Since it's known at compile time, compute the number of
1645 iterations in the peeled loop (the peeling factor) for use in
1646 updating DR_MISALIGNMENT values. The peeling factor is the
1647 vectorization factor minus the misalignment as an element
1648 count. */
1649 mis = DR_MISALIGNMENT (dr0);
1650 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1651 npeel = ((negative ? mis - nelements : nelements - mis)
1652 & (nelements - 1));
1655 /* For interleaved data access every iteration accesses all the
1656 members of the group, therefore we divide the number of iterations
1657 by the group size. */
1658 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1659 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1660 npeel /= GROUP_SIZE (stmt_info);
1662 if (dump_enabled_p ())
1663 dump_printf_loc (MSG_NOTE, vect_location,
1664 "Try peeling by %d\n", npeel);
1667 /* Ensure that all data refs can be vectorized after the peel. */
1668 FOR_EACH_VEC_ELT (datarefs, i, dr)
1670 int save_misalignment;
1672 if (dr == dr0)
1673 continue;
1675 stmt = DR_STMT (dr);
1676 stmt_info = vinfo_for_stmt (stmt);
1677 /* For interleaving, only the alignment of the first access
1678 matters. */
1679 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1680 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1681 continue;
1683 /* Strided accesses perform only component accesses, alignment is
1684 irrelevant for them. */
1685 if (STMT_VINFO_STRIDED_P (stmt_info)
1686 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1687 continue;
1689 save_misalignment = DR_MISALIGNMENT (dr);
1690 vect_update_misalignment_for_peel (dr, dr0, npeel);
1691 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1692 SET_DR_MISALIGNMENT (dr, save_misalignment);
1694 if (!supportable_dr_alignment)
1696 do_peeling = false;
1697 break;
1701 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1703 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1704 if (!stat)
1705 do_peeling = false;
1706 else
1708 body_cost_vec.release ();
1709 return stat;
1713 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1714 if (do_peeling)
1716 unsigned max_allowed_peel
1717 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1718 if (max_allowed_peel != (unsigned)-1)
1720 unsigned max_peel = npeel;
1721 if (max_peel == 0)
1723 gimple dr_stmt = DR_STMT (dr0);
1724 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1725 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1726 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1728 if (max_peel > max_allowed_peel)
1730 do_peeling = false;
1731 if (dump_enabled_p ())
1732 dump_printf_loc (MSG_NOTE, vect_location,
1733 "Disable peeling, max peels reached: %d\n", max_peel);
1738 /* Cost model #2 - if peeling may result in a remaining loop not
1739 iterating enough to be vectorized then do not peel. */
1740 if (do_peeling
1741 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1743 unsigned max_peel
1744 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1745 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1746 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1747 do_peeling = false;
1750 if (do_peeling)
1752 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1753 If the misalignment of DR_i is identical to that of dr0 then set
1754 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1755 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1756 by the peeling factor times the element size of DR_i (MOD the
1757 vectorization factor times the size). Otherwise, the
1758 misalignment of DR_i must be set to unknown. */
1759 FOR_EACH_VEC_ELT (datarefs, i, dr)
1760 if (dr != dr0)
1761 vect_update_misalignment_for_peel (dr, dr0, npeel);
1763 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1764 if (npeel)
1765 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1766 else
1767 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1768 = DR_MISALIGNMENT (dr0);
1769 SET_DR_MISALIGNMENT (dr0, 0);
1770 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_NOTE, vect_location,
1773 "Alignment of access forced using peeling.\n");
1774 dump_printf_loc (MSG_NOTE, vect_location,
1775 "Peeling for alignment will be applied.\n");
1777 /* The inside-loop cost will be accounted for in vectorizable_load
1778 and vectorizable_store correctly with adjusted alignments.
1779 Drop the body_cst_vec on the floor here. */
1780 body_cost_vec.release ();
1782 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1783 gcc_assert (stat);
1784 return stat;
1788 body_cost_vec.release ();
1790 /* (2) Versioning to force alignment. */
1792 /* Try versioning if:
1793 1) optimize loop for speed
1794 2) there is at least one unsupported misaligned data ref with an unknown
1795 misalignment, and
1796 3) all misaligned data refs with a known misalignment are supported, and
1797 4) the number of runtime alignment checks is within reason. */
1799 do_versioning =
1800 optimize_loop_nest_for_speed_p (loop)
1801 && (!loop->inner); /* FORNOW */
1803 if (do_versioning)
1805 FOR_EACH_VEC_ELT (datarefs, i, dr)
1807 stmt = DR_STMT (dr);
1808 stmt_info = vinfo_for_stmt (stmt);
1810 /* For interleaving, only the alignment of the first access
1811 matters. */
1812 if (aligned_access_p (dr)
1813 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1814 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1815 continue;
1817 if (STMT_VINFO_STRIDED_P (stmt_info))
1819 /* Strided loads perform only component accesses, alignment is
1820 irrelevant for them. */
1821 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1822 continue;
1823 do_versioning = false;
1824 break;
1827 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1829 if (!supportable_dr_alignment)
1831 gimple stmt;
1832 int mask;
1833 tree vectype;
1835 if (known_alignment_for_access_p (dr)
1836 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1837 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1839 do_versioning = false;
1840 break;
1843 stmt = DR_STMT (dr);
1844 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1845 gcc_assert (vectype);
1847 /* The rightmost bits of an aligned address must be zeros.
1848 Construct the mask needed for this test. For example,
1849 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1850 mask must be 15 = 0xf. */
1851 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1853 /* FORNOW: use the same mask to test all potentially unaligned
1854 references in the loop. The vectorizer currently supports
1855 a single vector size, see the reference to
1856 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1857 vectorization factor is computed. */
1858 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1859 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1860 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1861 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1862 DR_STMT (dr));
1866 /* Versioning requires at least one misaligned data reference. */
1867 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1868 do_versioning = false;
1869 else if (!do_versioning)
1870 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1873 if (do_versioning)
1875 vec<gimple> may_misalign_stmts
1876 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1877 gimple stmt;
1879 /* It can now be assumed that the data references in the statements
1880 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1881 of the loop being vectorized. */
1882 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1884 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1885 dr = STMT_VINFO_DATA_REF (stmt_info);
1886 SET_DR_MISALIGNMENT (dr, 0);
1887 if (dump_enabled_p ())
1888 dump_printf_loc (MSG_NOTE, vect_location,
1889 "Alignment of access forced using versioning.\n");
1892 if (dump_enabled_p ())
1893 dump_printf_loc (MSG_NOTE, vect_location,
1894 "Versioning for alignment will be applied.\n");
1896 /* Peeling and versioning can't be done together at this time. */
1897 gcc_assert (! (do_peeling && do_versioning));
1899 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1900 gcc_assert (stat);
1901 return stat;
1904 /* This point is reached if neither peeling nor versioning is being done. */
1905 gcc_assert (! (do_peeling || do_versioning));
1907 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1908 return stat;
1912 /* Function vect_find_same_alignment_drs.
1914 Update group and alignment relations according to the chosen
1915 vectorization factor. */
1917 static void
1918 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1919 loop_vec_info loop_vinfo)
1921 unsigned int i;
1922 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1923 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1924 struct data_reference *dra = DDR_A (ddr);
1925 struct data_reference *drb = DDR_B (ddr);
1926 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1927 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1928 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1929 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1930 lambda_vector dist_v;
1931 unsigned int loop_depth;
1933 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1934 return;
1936 if (dra == drb)
1937 return;
1939 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1940 return;
1942 /* Loop-based vectorization and known data dependence. */
1943 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1944 return;
1946 /* Data-dependence analysis reports a distance vector of zero
1947 for data-references that overlap only in the first iteration
1948 but have different sign step (see PR45764).
1949 So as a sanity check require equal DR_STEP. */
1950 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1951 return;
1953 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1954 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1956 int dist = dist_v[loop_depth];
1958 if (dump_enabled_p ())
1959 dump_printf_loc (MSG_NOTE, vect_location,
1960 "dependence distance = %d.\n", dist);
1962 /* Same loop iteration. */
1963 if (dist == 0
1964 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1966 /* Two references with distance zero have the same alignment. */
1967 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1968 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1969 if (dump_enabled_p ())
1971 dump_printf_loc (MSG_NOTE, vect_location,
1972 "accesses have the same alignment.\n");
1973 dump_printf (MSG_NOTE,
1974 "dependence distance modulo vf == 0 between ");
1975 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1976 dump_printf (MSG_NOTE, " and ");
1977 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1978 dump_printf (MSG_NOTE, "\n");
1985 /* Function vect_analyze_data_refs_alignment
1987 Analyze the alignment of the data-references in the loop.
1988 Return FALSE if a data reference is found that cannot be vectorized. */
1990 bool
1991 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1992 bb_vec_info bb_vinfo)
1994 if (dump_enabled_p ())
1995 dump_printf_loc (MSG_NOTE, vect_location,
1996 "=== vect_analyze_data_refs_alignment ===\n");
1998 /* Mark groups of data references with same alignment using
1999 data dependence information. */
2000 if (loop_vinfo)
2002 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2003 struct data_dependence_relation *ddr;
2004 unsigned int i;
2006 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2007 vect_find_same_alignment_drs (ddr, loop_vinfo);
2010 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2012 if (dump_enabled_p ())
2013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2014 "not vectorized: can't calculate alignment "
2015 "for data ref.\n");
2016 return false;
2019 return true;
2023 /* Analyze groups of accesses: check that DR belongs to a group of
2024 accesses of legal size, step, etc. Detect gaps, single element
2025 interleaving, and other special cases. Set grouped access info.
2026 Collect groups of strided stores for further use in SLP analysis. */
2028 static bool
2029 vect_analyze_group_access (struct data_reference *dr)
2031 tree step = DR_STEP (dr);
2032 tree scalar_type = TREE_TYPE (DR_REF (dr));
2033 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2034 gimple stmt = DR_STMT (dr);
2035 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2036 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2037 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2038 HOST_WIDE_INT dr_step = -1;
2039 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2040 bool slp_impossible = false;
2041 struct loop *loop = NULL;
2043 if (loop_vinfo)
2044 loop = LOOP_VINFO_LOOP (loop_vinfo);
2046 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2047 size of the interleaving group (including gaps). */
2048 if (tree_fits_shwi_p (step))
2050 dr_step = tree_to_shwi (step);
2051 groupsize = absu_hwi (dr_step) / type_size;
2053 else
2054 groupsize = 0;
2056 /* Not consecutive access is possible only if it is a part of interleaving. */
2057 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2059 /* Check if it this DR is a part of interleaving, and is a single
2060 element of the group that is accessed in the loop. */
2062 /* Gaps are supported only for loads. STEP must be a multiple of the type
2063 size. The size of the group must be a power of 2. */
2064 if (DR_IS_READ (dr)
2065 && (dr_step % type_size) == 0
2066 && groupsize > 0
2067 && exact_log2 (groupsize) != -1)
2069 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2070 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2071 if (dump_enabled_p ())
2073 dump_printf_loc (MSG_NOTE, vect_location,
2074 "Detected single element interleaving ");
2075 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2076 dump_printf (MSG_NOTE, " step ");
2077 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2078 dump_printf (MSG_NOTE, "\n");
2081 if (loop_vinfo)
2083 if (dump_enabled_p ())
2084 dump_printf_loc (MSG_NOTE, vect_location,
2085 "Data access with gaps requires scalar "
2086 "epilogue loop\n");
2087 if (loop->inner)
2089 if (dump_enabled_p ())
2090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2091 "Peeling for outer loop is not"
2092 " supported\n");
2093 return false;
2096 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2099 return true;
2102 if (dump_enabled_p ())
2104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2105 "not consecutive access ");
2106 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2107 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2110 if (bb_vinfo)
2112 /* Mark the statement as unvectorizable. */
2113 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2114 return true;
2117 return false;
2120 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2122 /* First stmt in the interleaving chain. Check the chain. */
2123 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2124 struct data_reference *data_ref = dr;
2125 unsigned int count = 1;
2126 tree prev_init = DR_INIT (data_ref);
2127 gimple prev = stmt;
2128 HOST_WIDE_INT diff, gaps = 0;
2130 while (next)
2132 /* Skip same data-refs. In case that two or more stmts share
2133 data-ref (supported only for loads), we vectorize only the first
2134 stmt, and the rest get their vectorized loads from the first
2135 one. */
2136 if (!tree_int_cst_compare (DR_INIT (data_ref),
2137 DR_INIT (STMT_VINFO_DATA_REF (
2138 vinfo_for_stmt (next)))))
2140 if (DR_IS_WRITE (data_ref))
2142 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2144 "Two store stmts share the same dr.\n");
2145 return false;
2148 /* For load use the same data-ref load. */
2149 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2151 prev = next;
2152 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2153 continue;
2156 prev = next;
2157 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2159 /* All group members have the same STEP by construction. */
2160 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2162 /* Check that the distance between two accesses is equal to the type
2163 size. Otherwise, we have gaps. */
2164 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2165 - TREE_INT_CST_LOW (prev_init)) / type_size;
2166 if (diff != 1)
2168 /* FORNOW: SLP of accesses with gaps is not supported. */
2169 slp_impossible = true;
2170 if (DR_IS_WRITE (data_ref))
2172 if (dump_enabled_p ())
2173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2174 "interleaved store with gaps\n");
2175 return false;
2178 gaps += diff - 1;
2181 last_accessed_element += diff;
2183 /* Store the gap from the previous member of the group. If there is no
2184 gap in the access, GROUP_GAP is always 1. */
2185 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2187 prev_init = DR_INIT (data_ref);
2188 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2189 /* Count the number of data-refs in the chain. */
2190 count++;
2193 if (groupsize == 0)
2194 groupsize = count + gaps;
2196 /* Check that the size of the interleaving is equal to count for stores,
2197 i.e., that there are no gaps. */
2198 if (groupsize != count
2199 && !DR_IS_READ (dr))
2201 if (dump_enabled_p ())
2202 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2203 "interleaved store with gaps\n");
2204 return false;
2207 /* If there is a gap after the last load in the group it is the
2208 difference between the groupsize and the last accessed
2209 element.
2210 When there is no gap, this difference should be 0. */
2211 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2213 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2214 if (dump_enabled_p ())
2216 dump_printf_loc (MSG_NOTE, vect_location,
2217 "Detected interleaving of size %d starting with ",
2218 (int)groupsize);
2219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2220 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2221 dump_printf_loc (MSG_NOTE, vect_location,
2222 "There is a gap of %d elements after the group\n",
2223 (int)GROUP_GAP (vinfo_for_stmt (stmt)));
2226 /* SLP: create an SLP data structure for every interleaving group of
2227 stores for further analysis in vect_analyse_slp. */
2228 if (DR_IS_WRITE (dr) && !slp_impossible)
2230 if (loop_vinfo)
2231 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2232 if (bb_vinfo)
2233 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2236 /* If there is a gap in the end of the group or the group size cannot
2237 be made a multiple of the vector element count then we access excess
2238 elements in the last iteration and thus need to peel that off. */
2239 if (loop_vinfo
2240 && (groupsize - last_accessed_element > 0
2241 || exact_log2 (groupsize) == -1))
2244 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 "Data access with gaps requires scalar "
2247 "epilogue loop\n");
2248 if (loop->inner)
2250 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252 "Peeling for outer loop is not supported\n");
2253 return false;
2256 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2260 return true;
2264 /* Analyze the access pattern of the data-reference DR.
2265 In case of non-consecutive accesses call vect_analyze_group_access() to
2266 analyze groups of accesses. */
2268 static bool
2269 vect_analyze_data_ref_access (struct data_reference *dr)
2271 tree step = DR_STEP (dr);
2272 tree scalar_type = TREE_TYPE (DR_REF (dr));
2273 gimple stmt = DR_STMT (dr);
2274 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2275 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2276 struct loop *loop = NULL;
2278 if (loop_vinfo)
2279 loop = LOOP_VINFO_LOOP (loop_vinfo);
2281 if (loop_vinfo && !step)
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 "bad data-ref access in loop\n");
2286 return false;
2289 /* Allow loads with zero step in inner-loop vectorization. */
2290 if (loop_vinfo && integer_zerop (step))
2292 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2293 if (!nested_in_vect_loop_p (loop, stmt))
2294 return DR_IS_READ (dr);
2295 /* Allow references with zero step for outer loops marked
2296 with pragma omp simd only - it guarantees absence of
2297 loop-carried dependencies between inner loop iterations. */
2298 if (!loop->force_vectorize)
2300 if (dump_enabled_p ())
2301 dump_printf_loc (MSG_NOTE, vect_location,
2302 "zero step in inner loop of nest\n");
2303 return false;
2307 if (loop && nested_in_vect_loop_p (loop, stmt))
2309 /* Interleaved accesses are not yet supported within outer-loop
2310 vectorization for references in the inner-loop. */
2311 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2313 /* For the rest of the analysis we use the outer-loop step. */
2314 step = STMT_VINFO_DR_STEP (stmt_info);
2315 if (integer_zerop (step))
2317 if (dump_enabled_p ())
2318 dump_printf_loc (MSG_NOTE, vect_location,
2319 "zero step in outer loop.\n");
2320 if (DR_IS_READ (dr))
2321 return true;
2322 else
2323 return false;
2327 /* Consecutive? */
2328 if (TREE_CODE (step) == INTEGER_CST)
2330 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2331 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2332 || (dr_step < 0
2333 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2335 /* Mark that it is not interleaving. */
2336 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2337 return true;
2341 if (loop && nested_in_vect_loop_p (loop, stmt))
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_NOTE, vect_location,
2345 "grouped access in outer loop.\n");
2346 return false;
2350 /* Assume this is a DR handled by non-constant strided load case. */
2351 if (TREE_CODE (step) != INTEGER_CST)
2352 return (STMT_VINFO_STRIDED_P (stmt_info)
2353 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2354 || vect_analyze_group_access (dr)));
2356 /* Not consecutive access - check if it's a part of interleaving group. */
2357 return vect_analyze_group_access (dr);
2362 /* A helper function used in the comparator function to sort data
2363 references. T1 and T2 are two data references to be compared.
2364 The function returns -1, 0, or 1. */
2366 static int
2367 compare_tree (tree t1, tree t2)
2369 int i, cmp;
2370 enum tree_code code;
2371 char tclass;
2373 if (t1 == t2)
2374 return 0;
2375 if (t1 == NULL)
2376 return -1;
2377 if (t2 == NULL)
2378 return 1;
2381 if (TREE_CODE (t1) != TREE_CODE (t2))
2382 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2384 code = TREE_CODE (t1);
2385 switch (code)
2387 /* For const values, we can just use hash values for comparisons. */
2388 case INTEGER_CST:
2389 case REAL_CST:
2390 case FIXED_CST:
2391 case STRING_CST:
2392 case COMPLEX_CST:
2393 case VECTOR_CST:
2395 hashval_t h1 = iterative_hash_expr (t1, 0);
2396 hashval_t h2 = iterative_hash_expr (t2, 0);
2397 if (h1 != h2)
2398 return h1 < h2 ? -1 : 1;
2399 break;
2402 case SSA_NAME:
2403 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2404 if (cmp != 0)
2405 return cmp;
2407 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2408 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2409 break;
2411 default:
2412 tclass = TREE_CODE_CLASS (code);
2414 /* For var-decl, we could compare their UIDs. */
2415 if (tclass == tcc_declaration)
2417 if (DECL_UID (t1) != DECL_UID (t2))
2418 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2419 break;
2422 /* For expressions with operands, compare their operands recursively. */
2423 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2425 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2426 if (cmp != 0)
2427 return cmp;
2431 return 0;
2435 /* Compare two data-references DRA and DRB to group them into chunks
2436 suitable for grouping. */
2438 static int
2439 dr_group_sort_cmp (const void *dra_, const void *drb_)
2441 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2442 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2443 int cmp;
2445 /* Stabilize sort. */
2446 if (dra == drb)
2447 return 0;
2449 /* Ordering of DRs according to base. */
2450 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2452 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2453 if (cmp != 0)
2454 return cmp;
2457 /* And according to DR_OFFSET. */
2458 if (!dr_equal_offsets_p (dra, drb))
2460 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2461 if (cmp != 0)
2462 return cmp;
2465 /* Put reads before writes. */
2466 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2467 return DR_IS_READ (dra) ? -1 : 1;
2469 /* Then sort after access size. */
2470 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2471 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2473 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2474 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2475 if (cmp != 0)
2476 return cmp;
2479 /* And after step. */
2480 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2482 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2483 if (cmp != 0)
2484 return cmp;
2487 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2488 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2489 if (cmp == 0)
2490 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2491 return cmp;
2494 /* Function vect_analyze_data_ref_accesses.
2496 Analyze the access pattern of all the data references in the loop.
2498 FORNOW: the only access pattern that is considered vectorizable is a
2499 simple step 1 (consecutive) access.
2501 FORNOW: handle only arrays and pointer accesses. */
2503 bool
2504 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2506 unsigned int i;
2507 vec<data_reference_p> datarefs;
2508 struct data_reference *dr;
2510 if (dump_enabled_p ())
2511 dump_printf_loc (MSG_NOTE, vect_location,
2512 "=== vect_analyze_data_ref_accesses ===\n");
2514 if (loop_vinfo)
2515 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2516 else
2517 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2519 if (datarefs.is_empty ())
2520 return true;
2522 /* Sort the array of datarefs to make building the interleaving chains
2523 linear. Don't modify the original vector's order, it is needed for
2524 determining what dependencies are reversed. */
2525 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2526 datarefs_copy.qsort (dr_group_sort_cmp);
2528 /* Build the interleaving chains. */
2529 for (i = 0; i < datarefs_copy.length () - 1;)
2531 data_reference_p dra = datarefs_copy[i];
2532 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2533 stmt_vec_info lastinfo = NULL;
2534 for (i = i + 1; i < datarefs_copy.length (); ++i)
2536 data_reference_p drb = datarefs_copy[i];
2537 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2539 /* ??? Imperfect sorting (non-compatible types, non-modulo
2540 accesses, same accesses) can lead to a group to be artificially
2541 split here as we don't just skip over those. If it really
2542 matters we can push those to a worklist and re-iterate
2543 over them. The we can just skip ahead to the next DR here. */
2545 /* Check that the data-refs have same first location (except init)
2546 and they are both either store or load (not load and store,
2547 not masked loads or stores). */
2548 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2549 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2550 DR_BASE_ADDRESS (drb), 0)
2551 || !dr_equal_offsets_p (dra, drb)
2552 || !gimple_assign_single_p (DR_STMT (dra))
2553 || !gimple_assign_single_p (DR_STMT (drb)))
2554 break;
2556 /* Check that the data-refs have the same constant size. */
2557 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2558 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2559 if (!tree_fits_uhwi_p (sza)
2560 || !tree_fits_uhwi_p (szb)
2561 || !tree_int_cst_equal (sza, szb))
2562 break;
2564 /* Check that the data-refs have the same step. */
2565 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2566 break;
2568 /* Do not place the same access in the interleaving chain twice. */
2569 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2570 break;
2572 /* Check the types are compatible.
2573 ??? We don't distinguish this during sorting. */
2574 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2575 TREE_TYPE (DR_REF (drb))))
2576 break;
2578 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2579 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2580 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2581 gcc_assert (init_a < init_b);
2583 /* If init_b == init_a + the size of the type * k, we have an
2584 interleaving, and DRA is accessed before DRB. */
2585 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2586 if ((init_b - init_a) % type_size_a != 0)
2587 break;
2589 /* If we have a store, the accesses are adjacent. This splits
2590 groups into chunks we support (we don't support vectorization
2591 of stores with gaps). */
2592 if (!DR_IS_READ (dra)
2593 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2594 (DR_INIT (datarefs_copy[i-1]))
2595 != type_size_a))
2596 break;
2598 /* If the step (if not zero or non-constant) is greater than the
2599 difference between data-refs' inits this splits groups into
2600 suitable sizes. */
2601 if (tree_fits_shwi_p (DR_STEP (dra)))
2603 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2604 if (step != 0 && step <= (init_b - init_a))
2605 break;
2608 if (dump_enabled_p ())
2610 dump_printf_loc (MSG_NOTE, vect_location,
2611 "Detected interleaving ");
2612 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2613 dump_printf (MSG_NOTE, " and ");
2614 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2615 dump_printf (MSG_NOTE, "\n");
2618 /* Link the found element into the group list. */
2619 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2621 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2622 lastinfo = stmtinfo_a;
2624 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2625 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2626 lastinfo = stmtinfo_b;
2630 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2631 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2632 && !vect_analyze_data_ref_access (dr))
2634 if (dump_enabled_p ())
2635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2636 "not vectorized: complicated access pattern.\n");
2638 if (bb_vinfo)
2640 /* Mark the statement as not vectorizable. */
2641 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2642 continue;
2644 else
2646 datarefs_copy.release ();
2647 return false;
2651 datarefs_copy.release ();
2652 return true;
2656 /* Operator == between two dr_with_seg_len objects.
2658 This equality operator is used to make sure two data refs
2659 are the same one so that we will consider to combine the
2660 aliasing checks of those two pairs of data dependent data
2661 refs. */
2663 static bool
2664 operator == (const dr_with_seg_len& d1,
2665 const dr_with_seg_len& d2)
2667 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2668 DR_BASE_ADDRESS (d2.dr), 0)
2669 && compare_tree (d1.offset, d2.offset) == 0
2670 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2673 /* Function comp_dr_with_seg_len_pair.
2675 Comparison function for sorting objects of dr_with_seg_len_pair_t
2676 so that we can combine aliasing checks in one scan. */
2678 static int
2679 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2681 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2682 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2684 const dr_with_seg_len &p11 = p1->first,
2685 &p12 = p1->second,
2686 &p21 = p2->first,
2687 &p22 = p2->second;
2689 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2690 if a and c have the same basic address snd step, and b and d have the same
2691 address and step. Therefore, if any a&c or b&d don't have the same address
2692 and step, we don't care the order of those two pairs after sorting. */
2693 int comp_res;
2695 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2696 DR_BASE_ADDRESS (p21.dr))) != 0)
2697 return comp_res;
2698 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2699 DR_BASE_ADDRESS (p22.dr))) != 0)
2700 return comp_res;
2701 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2702 return comp_res;
2703 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2704 return comp_res;
2705 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2706 return comp_res;
2707 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2708 return comp_res;
2710 return 0;
2713 /* Function vect_vfa_segment_size.
2715 Create an expression that computes the size of segment
2716 that will be accessed for a data reference. The functions takes into
2717 account that realignment loads may access one more vector.
2719 Input:
2720 DR: The data reference.
2721 LENGTH_FACTOR: segment length to consider.
2723 Return an expression whose value is the size of segment which will be
2724 accessed by DR. */
2726 static tree
2727 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2729 tree segment_length;
2731 if (integer_zerop (DR_STEP (dr)))
2732 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2733 else
2734 segment_length = size_binop (MULT_EXPR,
2735 fold_convert (sizetype, DR_STEP (dr)),
2736 fold_convert (sizetype, length_factor));
2738 if (vect_supportable_dr_alignment (dr, false)
2739 == dr_explicit_realign_optimized)
2741 tree vector_size = TYPE_SIZE_UNIT
2742 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2744 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2746 return segment_length;
2749 /* Function vect_prune_runtime_alias_test_list.
2751 Prune a list of ddrs to be tested at run-time by versioning for alias.
2752 Merge several alias checks into one if possible.
2753 Return FALSE if resulting list of ddrs is longer then allowed by
2754 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2756 bool
2757 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2759 vec<ddr_p> may_alias_ddrs =
2760 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2761 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2762 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2763 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2764 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2766 ddr_p ddr;
2767 unsigned int i;
2768 tree length_factor;
2770 if (dump_enabled_p ())
2771 dump_printf_loc (MSG_NOTE, vect_location,
2772 "=== vect_prune_runtime_alias_test_list ===\n");
2774 if (may_alias_ddrs.is_empty ())
2775 return true;
2777 /* Basically, for each pair of dependent data refs store_ptr_0
2778 and load_ptr_0, we create an expression:
2780 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2781 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2783 for aliasing checks. However, in some cases we can decrease
2784 the number of checks by combining two checks into one. For
2785 example, suppose we have another pair of data refs store_ptr_0
2786 and load_ptr_1, and if the following condition is satisfied:
2788 load_ptr_0 < load_ptr_1 &&
2789 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2791 (this condition means, in each iteration of vectorized loop,
2792 the accessed memory of store_ptr_0 cannot be between the memory
2793 of load_ptr_0 and load_ptr_1.)
2795 we then can use only the following expression to finish the
2796 alising checks between store_ptr_0 & load_ptr_0 and
2797 store_ptr_0 & load_ptr_1:
2799 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2800 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2802 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2803 same basic address. */
2805 comp_alias_ddrs.create (may_alias_ddrs.length ());
2807 /* First, we collect all data ref pairs for aliasing checks. */
2808 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2810 struct data_reference *dr_a, *dr_b;
2811 gimple dr_group_first_a, dr_group_first_b;
2812 tree segment_length_a, segment_length_b;
2813 gimple stmt_a, stmt_b;
2815 dr_a = DDR_A (ddr);
2816 stmt_a = DR_STMT (DDR_A (ddr));
2817 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2818 if (dr_group_first_a)
2820 stmt_a = dr_group_first_a;
2821 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2824 dr_b = DDR_B (ddr);
2825 stmt_b = DR_STMT (DDR_B (ddr));
2826 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2827 if (dr_group_first_b)
2829 stmt_b = dr_group_first_b;
2830 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2833 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2834 length_factor = scalar_loop_iters;
2835 else
2836 length_factor = size_int (vect_factor);
2837 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2838 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2840 dr_with_seg_len_pair_t dr_with_seg_len_pair
2841 (dr_with_seg_len (dr_a, segment_length_a),
2842 dr_with_seg_len (dr_b, segment_length_b));
2844 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2845 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2847 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2850 /* Second, we sort the collected data ref pairs so that we can scan
2851 them once to combine all possible aliasing checks. */
2852 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2854 /* Third, we scan the sorted dr pairs and check if we can combine
2855 alias checks of two neighbouring dr pairs. */
2856 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2858 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2859 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2860 *dr_b1 = &comp_alias_ddrs[i-1].second,
2861 *dr_a2 = &comp_alias_ddrs[i].first,
2862 *dr_b2 = &comp_alias_ddrs[i].second;
2864 /* Remove duplicate data ref pairs. */
2865 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2867 if (dump_enabled_p ())
2869 dump_printf_loc (MSG_NOTE, vect_location,
2870 "found equal ranges ");
2871 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2872 DR_REF (dr_a1->dr));
2873 dump_printf (MSG_NOTE, ", ");
2874 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2875 DR_REF (dr_b1->dr));
2876 dump_printf (MSG_NOTE, " and ");
2877 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2878 DR_REF (dr_a2->dr));
2879 dump_printf (MSG_NOTE, ", ");
2880 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2881 DR_REF (dr_b2->dr));
2882 dump_printf (MSG_NOTE, "\n");
2885 comp_alias_ddrs.ordered_remove (i--);
2886 continue;
2889 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2891 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2892 and DR_A1 and DR_A2 are two consecutive memrefs. */
2893 if (*dr_a1 == *dr_a2)
2895 std::swap (dr_a1, dr_b1);
2896 std::swap (dr_a2, dr_b2);
2899 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2900 DR_BASE_ADDRESS (dr_a2->dr),
2902 || !tree_fits_shwi_p (dr_a1->offset)
2903 || !tree_fits_shwi_p (dr_a2->offset))
2904 continue;
2906 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2907 - tree_to_shwi (dr_a1->offset));
2910 /* Now we check if the following condition is satisfied:
2912 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2914 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2915 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2916 have to make a best estimation. We can get the minimum value
2917 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2918 then either of the following two conditions can guarantee the
2919 one above:
2921 1: DIFF <= MIN_SEG_LEN_B
2922 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2926 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2927 ? tree_to_shwi (dr_b1->seg_len)
2928 : vect_factor);
2930 if (diff <= min_seg_len_b
2931 || (tree_fits_shwi_p (dr_a1->seg_len)
2932 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2934 if (dump_enabled_p ())
2936 dump_printf_loc (MSG_NOTE, vect_location,
2937 "merging ranges for ");
2938 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2939 DR_REF (dr_a1->dr));
2940 dump_printf (MSG_NOTE, ", ");
2941 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2942 DR_REF (dr_b1->dr));
2943 dump_printf (MSG_NOTE, " and ");
2944 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2945 DR_REF (dr_a2->dr));
2946 dump_printf (MSG_NOTE, ", ");
2947 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2948 DR_REF (dr_b2->dr));
2949 dump_printf (MSG_NOTE, "\n");
2952 dr_a1->seg_len = size_binop (PLUS_EXPR,
2953 dr_a2->seg_len, size_int (diff));
2954 comp_alias_ddrs.ordered_remove (i--);
2959 dump_printf_loc (MSG_NOTE, vect_location,
2960 "improved number of alias checks from %d to %d\n",
2961 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2962 if ((int) comp_alias_ddrs.length () >
2963 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2964 return false;
2966 return true;
2969 /* Check whether a non-affine read in stmt is suitable for gather load
2970 and if so, return a builtin decl for that operation. */
2972 tree
2973 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2974 tree *offp, int *scalep)
2976 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2977 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2978 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2979 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2980 tree offtype = NULL_TREE;
2981 tree decl, base, off;
2982 machine_mode pmode;
2983 int punsignedp, pvolatilep;
2985 base = DR_REF (dr);
2986 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2987 see if we can use the def stmt of the address. */
2988 if (is_gimple_call (stmt)
2989 && gimple_call_internal_p (stmt)
2990 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2991 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2992 && TREE_CODE (base) == MEM_REF
2993 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2994 && integer_zerop (TREE_OPERAND (base, 1))
2995 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2997 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2998 if (is_gimple_assign (def_stmt)
2999 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3000 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3003 /* The gather builtins need address of the form
3004 loop_invariant + vector * {1, 2, 4, 8}
3006 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3007 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3008 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3009 multiplications and additions in it. To get a vector, we need
3010 a single SSA_NAME that will be defined in the loop and will
3011 contain everything that is not loop invariant and that can be
3012 vectorized. The following code attempts to find such a preexistng
3013 SSA_NAME OFF and put the loop invariants into a tree BASE
3014 that can be gimplified before the loop. */
3015 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3016 &pmode, &punsignedp, &pvolatilep, false);
3017 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3019 if (TREE_CODE (base) == MEM_REF)
3021 if (!integer_zerop (TREE_OPERAND (base, 1)))
3023 if (off == NULL_TREE)
3025 offset_int moff = mem_ref_offset (base);
3026 off = wide_int_to_tree (sizetype, moff);
3028 else
3029 off = size_binop (PLUS_EXPR, off,
3030 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3032 base = TREE_OPERAND (base, 0);
3034 else
3035 base = build_fold_addr_expr (base);
3037 if (off == NULL_TREE)
3038 off = size_zero_node;
3040 /* If base is not loop invariant, either off is 0, then we start with just
3041 the constant offset in the loop invariant BASE and continue with base
3042 as OFF, otherwise give up.
3043 We could handle that case by gimplifying the addition of base + off
3044 into some SSA_NAME and use that as off, but for now punt. */
3045 if (!expr_invariant_in_loop_p (loop, base))
3047 if (!integer_zerop (off))
3048 return NULL_TREE;
3049 off = base;
3050 base = size_int (pbitpos / BITS_PER_UNIT);
3052 /* Otherwise put base + constant offset into the loop invariant BASE
3053 and continue with OFF. */
3054 else
3056 base = fold_convert (sizetype, base);
3057 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3060 /* OFF at this point may be either a SSA_NAME or some tree expression
3061 from get_inner_reference. Try to peel off loop invariants from it
3062 into BASE as long as possible. */
3063 STRIP_NOPS (off);
3064 while (offtype == NULL_TREE)
3066 enum tree_code code;
3067 tree op0, op1, add = NULL_TREE;
3069 if (TREE_CODE (off) == SSA_NAME)
3071 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3073 if (expr_invariant_in_loop_p (loop, off))
3074 return NULL_TREE;
3076 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3077 break;
3079 op0 = gimple_assign_rhs1 (def_stmt);
3080 code = gimple_assign_rhs_code (def_stmt);
3081 op1 = gimple_assign_rhs2 (def_stmt);
3083 else
3085 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3086 return NULL_TREE;
3087 code = TREE_CODE (off);
3088 extract_ops_from_tree (off, &code, &op0, &op1);
3090 switch (code)
3092 case POINTER_PLUS_EXPR:
3093 case PLUS_EXPR:
3094 if (expr_invariant_in_loop_p (loop, op0))
3096 add = op0;
3097 off = op1;
3098 do_add:
3099 add = fold_convert (sizetype, add);
3100 if (scale != 1)
3101 add = size_binop (MULT_EXPR, add, size_int (scale));
3102 base = size_binop (PLUS_EXPR, base, add);
3103 continue;
3105 if (expr_invariant_in_loop_p (loop, op1))
3107 add = op1;
3108 off = op0;
3109 goto do_add;
3111 break;
3112 case MINUS_EXPR:
3113 if (expr_invariant_in_loop_p (loop, op1))
3115 add = fold_convert (sizetype, op1);
3116 add = size_binop (MINUS_EXPR, size_zero_node, add);
3117 off = op0;
3118 goto do_add;
3120 break;
3121 case MULT_EXPR:
3122 if (scale == 1 && tree_fits_shwi_p (op1))
3124 scale = tree_to_shwi (op1);
3125 off = op0;
3126 continue;
3128 break;
3129 case SSA_NAME:
3130 off = op0;
3131 continue;
3132 CASE_CONVERT:
3133 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3134 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3135 break;
3136 if (TYPE_PRECISION (TREE_TYPE (op0))
3137 == TYPE_PRECISION (TREE_TYPE (off)))
3139 off = op0;
3140 continue;
3142 if (TYPE_PRECISION (TREE_TYPE (op0))
3143 < TYPE_PRECISION (TREE_TYPE (off)))
3145 off = op0;
3146 offtype = TREE_TYPE (off);
3147 STRIP_NOPS (off);
3148 continue;
3150 break;
3151 default:
3152 break;
3154 break;
3157 /* If at the end OFF still isn't a SSA_NAME or isn't
3158 defined in the loop, punt. */
3159 if (TREE_CODE (off) != SSA_NAME
3160 || expr_invariant_in_loop_p (loop, off))
3161 return NULL_TREE;
3163 if (offtype == NULL_TREE)
3164 offtype = TREE_TYPE (off);
3166 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3167 offtype, scale);
3168 if (decl == NULL_TREE)
3169 return NULL_TREE;
3171 if (basep)
3172 *basep = base;
3173 if (offp)
3174 *offp = off;
3175 if (scalep)
3176 *scalep = scale;
3177 return decl;
3180 /* Function vect_analyze_data_refs.
3182 Find all the data references in the loop or basic block.
3184 The general structure of the analysis of data refs in the vectorizer is as
3185 follows:
3186 1- vect_analyze_data_refs(loop/bb): call
3187 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3188 in the loop/bb and their dependences.
3189 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3190 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3191 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3195 bool
3196 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3197 bb_vec_info bb_vinfo,
3198 int *min_vf, unsigned *n_stmts)
3200 struct loop *loop = NULL;
3201 basic_block bb = NULL;
3202 unsigned int i;
3203 vec<data_reference_p> datarefs;
3204 struct data_reference *dr;
3205 tree scalar_type;
3207 if (dump_enabled_p ())
3208 dump_printf_loc (MSG_NOTE, vect_location,
3209 "=== vect_analyze_data_refs ===\n");
3211 if (loop_vinfo)
3213 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3215 loop = LOOP_VINFO_LOOP (loop_vinfo);
3216 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3217 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3219 if (dump_enabled_p ())
3220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3221 "not vectorized: loop contains function calls"
3222 " or data references that cannot be analyzed\n");
3223 return false;
3226 for (i = 0; i < loop->num_nodes; i++)
3228 gimple_stmt_iterator gsi;
3230 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3232 gimple stmt = gsi_stmt (gsi);
3233 if (is_gimple_debug (stmt))
3234 continue;
3235 ++*n_stmts;
3236 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3238 if (is_gimple_call (stmt) && loop->safelen)
3240 tree fndecl = gimple_call_fndecl (stmt), op;
3241 if (fndecl != NULL_TREE)
3243 struct cgraph_node *node = cgraph_node::get (fndecl);
3244 if (node != NULL && node->simd_clones != NULL)
3246 unsigned int j, n = gimple_call_num_args (stmt);
3247 for (j = 0; j < n; j++)
3249 op = gimple_call_arg (stmt, j);
3250 if (DECL_P (op)
3251 || (REFERENCE_CLASS_P (op)
3252 && get_base_address (op)))
3253 break;
3255 op = gimple_call_lhs (stmt);
3256 /* Ignore #pragma omp declare simd functions
3257 if they don't have data references in the
3258 call stmt itself. */
3259 if (j == n
3260 && !(op
3261 && (DECL_P (op)
3262 || (REFERENCE_CLASS_P (op)
3263 && get_base_address (op)))))
3264 continue;
3268 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3269 if (dump_enabled_p ())
3270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3271 "not vectorized: loop contains function "
3272 "calls or data references that cannot "
3273 "be analyzed\n");
3274 return false;
3279 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3281 else
3283 gimple_stmt_iterator gsi;
3285 bb = BB_VINFO_BB (bb_vinfo);
3286 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3288 gimple stmt = gsi_stmt (gsi);
3289 if (is_gimple_debug (stmt))
3290 continue;
3291 ++*n_stmts;
3292 if (!find_data_references_in_stmt (NULL, stmt,
3293 &BB_VINFO_DATAREFS (bb_vinfo)))
3295 /* Mark the rest of the basic-block as unvectorizable. */
3296 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3298 stmt = gsi_stmt (gsi);
3299 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3301 break;
3305 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3308 /* Go through the data-refs, check that the analysis succeeded. Update
3309 pointer from stmt_vec_info struct to DR and vectype. */
3311 FOR_EACH_VEC_ELT (datarefs, i, dr)
3313 gimple stmt;
3314 stmt_vec_info stmt_info;
3315 tree base, offset, init;
3316 bool gather = false;
3317 bool simd_lane_access = false;
3318 int vf;
3320 again:
3321 if (!dr || !DR_REF (dr))
3323 if (dump_enabled_p ())
3324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3325 "not vectorized: unhandled data-ref\n");
3326 return false;
3329 stmt = DR_STMT (dr);
3330 stmt_info = vinfo_for_stmt (stmt);
3332 /* Discard clobbers from the dataref vector. We will remove
3333 clobber stmts during vectorization. */
3334 if (gimple_clobber_p (stmt))
3336 free_data_ref (dr);
3337 if (i == datarefs.length () - 1)
3339 datarefs.pop ();
3340 break;
3342 datarefs.ordered_remove (i);
3343 dr = datarefs[i];
3344 goto again;
3347 /* Check that analysis of the data-ref succeeded. */
3348 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3349 || !DR_STEP (dr))
3351 bool maybe_gather
3352 = DR_IS_READ (dr)
3353 && !TREE_THIS_VOLATILE (DR_REF (dr))
3354 && targetm.vectorize.builtin_gather != NULL;
3355 bool maybe_simd_lane_access
3356 = loop_vinfo && loop->simduid;
3358 /* If target supports vector gather loads, or if this might be
3359 a SIMD lane access, see if they can't be used. */
3360 if (loop_vinfo
3361 && (maybe_gather || maybe_simd_lane_access)
3362 && !nested_in_vect_loop_p (loop, stmt))
3364 struct data_reference *newdr
3365 = create_data_ref (NULL, loop_containing_stmt (stmt),
3366 DR_REF (dr), stmt, true);
3367 gcc_assert (newdr != NULL && DR_REF (newdr));
3368 if (DR_BASE_ADDRESS (newdr)
3369 && DR_OFFSET (newdr)
3370 && DR_INIT (newdr)
3371 && DR_STEP (newdr)
3372 && integer_zerop (DR_STEP (newdr)))
3374 if (maybe_simd_lane_access)
3376 tree off = DR_OFFSET (newdr);
3377 STRIP_NOPS (off);
3378 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3379 && TREE_CODE (off) == MULT_EXPR
3380 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3382 tree step = TREE_OPERAND (off, 1);
3383 off = TREE_OPERAND (off, 0);
3384 STRIP_NOPS (off);
3385 if (CONVERT_EXPR_P (off)
3386 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3387 0)))
3388 < TYPE_PRECISION (TREE_TYPE (off)))
3389 off = TREE_OPERAND (off, 0);
3390 if (TREE_CODE (off) == SSA_NAME)
3392 gimple def = SSA_NAME_DEF_STMT (off);
3393 tree reft = TREE_TYPE (DR_REF (newdr));
3394 if (is_gimple_call (def)
3395 && gimple_call_internal_p (def)
3396 && (gimple_call_internal_fn (def)
3397 == IFN_GOMP_SIMD_LANE))
3399 tree arg = gimple_call_arg (def, 0);
3400 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3401 arg = SSA_NAME_VAR (arg);
3402 if (arg == loop->simduid
3403 /* For now. */
3404 && tree_int_cst_equal
3405 (TYPE_SIZE_UNIT (reft),
3406 step))
3408 DR_OFFSET (newdr) = ssize_int (0);
3409 DR_STEP (newdr) = step;
3410 DR_ALIGNED_TO (newdr)
3411 = size_int (BIGGEST_ALIGNMENT);
3412 dr = newdr;
3413 simd_lane_access = true;
3419 if (!simd_lane_access && maybe_gather)
3421 dr = newdr;
3422 gather = true;
3425 if (!gather && !simd_lane_access)
3426 free_data_ref (newdr);
3429 if (!gather && !simd_lane_access)
3431 if (dump_enabled_p ())
3433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3434 "not vectorized: data ref analysis "
3435 "failed ");
3436 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3437 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3440 if (bb_vinfo)
3441 break;
3443 return false;
3447 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3449 if (dump_enabled_p ())
3450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3451 "not vectorized: base addr of dr is a "
3452 "constant\n");
3454 if (bb_vinfo)
3455 break;
3457 if (gather || simd_lane_access)
3458 free_data_ref (dr);
3459 return false;
3462 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3464 if (dump_enabled_p ())
3466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3467 "not vectorized: volatile type ");
3468 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3469 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3472 if (bb_vinfo)
3473 break;
3475 return false;
3478 if (stmt_can_throw_internal (stmt))
3480 if (dump_enabled_p ())
3482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3483 "not vectorized: statement can throw an "
3484 "exception ");
3485 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3486 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3489 if (bb_vinfo)
3490 break;
3492 if (gather || simd_lane_access)
3493 free_data_ref (dr);
3494 return false;
3497 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3498 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3500 if (dump_enabled_p ())
3502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3503 "not vectorized: statement is bitfield "
3504 "access ");
3505 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3506 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3509 if (bb_vinfo)
3510 break;
3512 if (gather || simd_lane_access)
3513 free_data_ref (dr);
3514 return false;
3517 base = unshare_expr (DR_BASE_ADDRESS (dr));
3518 offset = unshare_expr (DR_OFFSET (dr));
3519 init = unshare_expr (DR_INIT (dr));
3521 if (is_gimple_call (stmt)
3522 && (!gimple_call_internal_p (stmt)
3523 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3524 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3526 if (dump_enabled_p ())
3528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3529 "not vectorized: dr in a call ");
3530 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3531 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3534 if (bb_vinfo)
3535 break;
3537 if (gather || simd_lane_access)
3538 free_data_ref (dr);
3539 return false;
3542 /* Update DR field in stmt_vec_info struct. */
3544 /* If the dataref is in an inner-loop of the loop that is considered for
3545 for vectorization, we also want to analyze the access relative to
3546 the outer-loop (DR contains information only relative to the
3547 inner-most enclosing loop). We do that by building a reference to the
3548 first location accessed by the inner-loop, and analyze it relative to
3549 the outer-loop. */
3550 if (loop && nested_in_vect_loop_p (loop, stmt))
3552 tree outer_step, outer_base, outer_init;
3553 HOST_WIDE_INT pbitsize, pbitpos;
3554 tree poffset;
3555 machine_mode pmode;
3556 int punsignedp, pvolatilep;
3557 affine_iv base_iv, offset_iv;
3558 tree dinit;
3560 /* Build a reference to the first location accessed by the
3561 inner-loop: *(BASE+INIT). (The first location is actually
3562 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3563 tree inner_base = build_fold_indirect_ref
3564 (fold_build_pointer_plus (base, init));
3566 if (dump_enabled_p ())
3568 dump_printf_loc (MSG_NOTE, vect_location,
3569 "analyze in outer-loop: ");
3570 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3571 dump_printf (MSG_NOTE, "\n");
3574 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3575 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3576 gcc_assert (outer_base != NULL_TREE);
3578 if (pbitpos % BITS_PER_UNIT != 0)
3580 if (dump_enabled_p ())
3581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3582 "failed: bit offset alignment.\n");
3583 return false;
3586 outer_base = build_fold_addr_expr (outer_base);
3587 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3588 &base_iv, false))
3590 if (dump_enabled_p ())
3591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3592 "failed: evolution of base is not affine.\n");
3593 return false;
3596 if (offset)
3598 if (poffset)
3599 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3600 poffset);
3601 else
3602 poffset = offset;
3605 if (!poffset)
3607 offset_iv.base = ssize_int (0);
3608 offset_iv.step = ssize_int (0);
3610 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3611 &offset_iv, false))
3613 if (dump_enabled_p ())
3614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3615 "evolution of offset is not affine.\n");
3616 return false;
3619 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3620 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3621 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3622 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3623 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3625 outer_step = size_binop (PLUS_EXPR,
3626 fold_convert (ssizetype, base_iv.step),
3627 fold_convert (ssizetype, offset_iv.step));
3629 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3630 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3631 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3632 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3633 STMT_VINFO_DR_OFFSET (stmt_info) =
3634 fold_convert (ssizetype, offset_iv.base);
3635 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3636 size_int (highest_pow2_factor (offset_iv.base));
3638 if (dump_enabled_p ())
3640 dump_printf_loc (MSG_NOTE, vect_location,
3641 "\touter base_address: ");
3642 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3643 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3644 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3645 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3646 STMT_VINFO_DR_OFFSET (stmt_info));
3647 dump_printf (MSG_NOTE,
3648 "\n\touter constant offset from base address: ");
3649 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3650 STMT_VINFO_DR_INIT (stmt_info));
3651 dump_printf (MSG_NOTE, "\n\touter step: ");
3652 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3653 STMT_VINFO_DR_STEP (stmt_info));
3654 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3655 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3656 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3657 dump_printf (MSG_NOTE, "\n");
3661 if (STMT_VINFO_DATA_REF (stmt_info))
3663 if (dump_enabled_p ())
3665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3666 "not vectorized: more than one data ref "
3667 "in stmt: ");
3668 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3669 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3672 if (bb_vinfo)
3673 break;
3675 if (gather || simd_lane_access)
3676 free_data_ref (dr);
3677 return false;
3680 STMT_VINFO_DATA_REF (stmt_info) = dr;
3681 if (simd_lane_access)
3683 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3684 free_data_ref (datarefs[i]);
3685 datarefs[i] = dr;
3688 /* Set vectype for STMT. */
3689 scalar_type = TREE_TYPE (DR_REF (dr));
3690 STMT_VINFO_VECTYPE (stmt_info)
3691 = get_vectype_for_scalar_type (scalar_type);
3692 if (!STMT_VINFO_VECTYPE (stmt_info))
3694 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3697 "not vectorized: no vectype for stmt: ");
3698 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3699 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3700 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3701 scalar_type);
3702 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3705 if (bb_vinfo)
3706 break;
3708 if (gather || simd_lane_access)
3710 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3711 if (gather)
3712 free_data_ref (dr);
3714 return false;
3716 else
3718 if (dump_enabled_p ())
3720 dump_printf_loc (MSG_NOTE, vect_location,
3721 "got vectype for stmt: ");
3722 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3723 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3724 STMT_VINFO_VECTYPE (stmt_info));
3725 dump_printf (MSG_NOTE, "\n");
3729 /* Adjust the minimal vectorization factor according to the
3730 vector type. */
3731 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3732 if (vf > *min_vf)
3733 *min_vf = vf;
3735 if (gather)
3737 tree off;
3739 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3740 if (gather
3741 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3742 gather = false;
3743 if (!gather)
3745 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3746 free_data_ref (dr);
3747 if (dump_enabled_p ())
3749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3750 "not vectorized: not suitable for gather "
3751 "load ");
3752 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3753 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3755 return false;
3758 datarefs[i] = dr;
3759 STMT_VINFO_GATHER_P (stmt_info) = true;
3761 else if (loop_vinfo
3762 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3764 if (nested_in_vect_loop_p (loop, stmt))
3766 if (dump_enabled_p ())
3768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3769 "not vectorized: not suitable for strided "
3770 "load ");
3771 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3772 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3774 return false;
3776 STMT_VINFO_STRIDED_P (stmt_info) = true;
3780 /* If we stopped analysis at the first dataref we could not analyze
3781 when trying to vectorize a basic-block mark the rest of the datarefs
3782 as not vectorizable and truncate the vector of datarefs. That
3783 avoids spending useless time in analyzing their dependence. */
3784 if (i != datarefs.length ())
3786 gcc_assert (bb_vinfo != NULL);
3787 for (unsigned j = i; j < datarefs.length (); ++j)
3789 data_reference_p dr = datarefs[j];
3790 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3791 free_data_ref (dr);
3793 datarefs.truncate (i);
3796 return true;
3800 /* Function vect_get_new_vect_var.
3802 Returns a name for a new variable. The current naming scheme appends the
3803 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3804 the name of vectorizer generated variables, and appends that to NAME if
3805 provided. */
3807 tree
3808 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3810 const char *prefix;
3811 tree new_vect_var;
3813 switch (var_kind)
3815 case vect_simple_var:
3816 prefix = "vect";
3817 break;
3818 case vect_scalar_var:
3819 prefix = "stmp";
3820 break;
3821 case vect_pointer_var:
3822 prefix = "vectp";
3823 break;
3824 default:
3825 gcc_unreachable ();
3828 if (name)
3830 char* tmp = concat (prefix, "_", name, NULL);
3831 new_vect_var = create_tmp_reg (type, tmp);
3832 free (tmp);
3834 else
3835 new_vect_var = create_tmp_reg (type, prefix);
3837 return new_vect_var;
3840 /* Duplicate ptr info and set alignment/misaligment on NAME from DR. */
3842 static void
3843 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3844 stmt_vec_info stmt_info)
3846 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3847 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3848 int misalign = DR_MISALIGNMENT (dr);
3849 if (misalign == -1)
3850 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3851 else
3852 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3855 /* Function vect_create_addr_base_for_vector_ref.
3857 Create an expression that computes the address of the first memory location
3858 that will be accessed for a data reference.
3860 Input:
3861 STMT: The statement containing the data reference.
3862 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3863 OFFSET: Optional. If supplied, it is be added to the initial address.
3864 LOOP: Specify relative to which loop-nest should the address be computed.
3865 For example, when the dataref is in an inner-loop nested in an
3866 outer-loop that is now being vectorized, LOOP can be either the
3867 outer-loop, or the inner-loop. The first memory location accessed
3868 by the following dataref ('in' points to short):
3870 for (i=0; i<N; i++)
3871 for (j=0; j<M; j++)
3872 s += in[i+j]
3874 is as follows:
3875 if LOOP=i_loop: &in (relative to i_loop)
3876 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3877 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3878 initial address. Unlike OFFSET, which is number of elements to
3879 be added, BYTE_OFFSET is measured in bytes.
3881 Output:
3882 1. Return an SSA_NAME whose value is the address of the memory location of
3883 the first vector of the data reference.
3884 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3885 these statement(s) which define the returned SSA_NAME.
3887 FORNOW: We are only handling array accesses with step 1. */
3889 tree
3890 vect_create_addr_base_for_vector_ref (gimple stmt,
3891 gimple_seq *new_stmt_list,
3892 tree offset,
3893 struct loop *loop,
3894 tree byte_offset)
3896 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3897 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3898 tree data_ref_base;
3899 const char *base_name;
3900 tree addr_base;
3901 tree dest;
3902 gimple_seq seq = NULL;
3903 tree base_offset;
3904 tree init;
3905 tree vect_ptr_type;
3906 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3907 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3909 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3911 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3913 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3915 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3916 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3917 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3919 else
3921 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3922 base_offset = unshare_expr (DR_OFFSET (dr));
3923 init = unshare_expr (DR_INIT (dr));
3926 if (loop_vinfo)
3927 base_name = get_name (data_ref_base);
3928 else
3930 base_offset = ssize_int (0);
3931 init = ssize_int (0);
3932 base_name = get_name (DR_REF (dr));
3935 /* Create base_offset */
3936 base_offset = size_binop (PLUS_EXPR,
3937 fold_convert (sizetype, base_offset),
3938 fold_convert (sizetype, init));
3940 if (offset)
3942 offset = fold_build2 (MULT_EXPR, sizetype,
3943 fold_convert (sizetype, offset), step);
3944 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3945 base_offset, offset);
3947 if (byte_offset)
3949 byte_offset = fold_convert (sizetype, byte_offset);
3950 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3951 base_offset, byte_offset);
3954 /* base + base_offset */
3955 if (loop_vinfo)
3956 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3957 else
3959 addr_base = build1 (ADDR_EXPR,
3960 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3961 unshare_expr (DR_REF (dr)));
3964 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3965 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3966 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
3967 gimple_seq_add_seq (new_stmt_list, seq);
3969 if (DR_PTR_INFO (dr)
3970 && TREE_CODE (addr_base) == SSA_NAME
3971 && !SSA_NAME_PTR_INFO (addr_base))
3973 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3974 if (offset || byte_offset)
3975 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3978 if (dump_enabled_p ())
3980 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3981 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3982 dump_printf (MSG_NOTE, "\n");
3985 return addr_base;
3989 /* Function vect_create_data_ref_ptr.
3991 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3992 location accessed in the loop by STMT, along with the def-use update
3993 chain to appropriately advance the pointer through the loop iterations.
3994 Also set aliasing information for the pointer. This pointer is used by
3995 the callers to this function to create a memory reference expression for
3996 vector load/store access.
3998 Input:
3999 1. STMT: a stmt that references memory. Expected to be of the form
4000 GIMPLE_ASSIGN <name, data-ref> or
4001 GIMPLE_ASSIGN <data-ref, name>.
4002 2. AGGR_TYPE: the type of the reference, which should be either a vector
4003 or an array.
4004 3. AT_LOOP: the loop where the vector memref is to be created.
4005 4. OFFSET (optional): an offset to be added to the initial address accessed
4006 by the data-ref in STMT.
4007 5. BSI: location where the new stmts are to be placed if there is no loop
4008 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4009 pointing to the initial address.
4010 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4011 to the initial address accessed by the data-ref in STMT. This is
4012 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4013 in bytes.
4015 Output:
4016 1. Declare a new ptr to vector_type, and have it point to the base of the
4017 data reference (initial addressed accessed by the data reference).
4018 For example, for vector of type V8HI, the following code is generated:
4020 v8hi *ap;
4021 ap = (v8hi *)initial_address;
4023 if OFFSET is not supplied:
4024 initial_address = &a[init];
4025 if OFFSET is supplied:
4026 initial_address = &a[init + OFFSET];
4027 if BYTE_OFFSET is supplied:
4028 initial_address = &a[init] + BYTE_OFFSET;
4030 Return the initial_address in INITIAL_ADDRESS.
4032 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4033 update the pointer in each iteration of the loop.
4035 Return the increment stmt that updates the pointer in PTR_INCR.
4037 3. Set INV_P to true if the access pattern of the data reference in the
4038 vectorized loop is invariant. Set it to false otherwise.
4040 4. Return the pointer. */
4042 tree
4043 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4044 tree offset, tree *initial_address,
4045 gimple_stmt_iterator *gsi, gimple *ptr_incr,
4046 bool only_init, bool *inv_p, tree byte_offset)
4048 const char *base_name;
4049 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4050 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4051 struct loop *loop = NULL;
4052 bool nested_in_vect_loop = false;
4053 struct loop *containing_loop = NULL;
4054 tree aggr_ptr_type;
4055 tree aggr_ptr;
4056 tree new_temp;
4057 gimple_seq new_stmt_list = NULL;
4058 edge pe = NULL;
4059 basic_block new_bb;
4060 tree aggr_ptr_init;
4061 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4062 tree aptr;
4063 gimple_stmt_iterator incr_gsi;
4064 bool insert_after;
4065 tree indx_before_incr, indx_after_incr;
4066 gimple incr;
4067 tree step;
4068 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4070 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4071 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4073 if (loop_vinfo)
4075 loop = LOOP_VINFO_LOOP (loop_vinfo);
4076 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4077 containing_loop = (gimple_bb (stmt))->loop_father;
4078 pe = loop_preheader_edge (loop);
4080 else
4082 gcc_assert (bb_vinfo);
4083 only_init = true;
4084 *ptr_incr = NULL;
4087 /* Check the step (evolution) of the load in LOOP, and record
4088 whether it's invariant. */
4089 if (nested_in_vect_loop)
4090 step = STMT_VINFO_DR_STEP (stmt_info);
4091 else
4092 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4094 if (integer_zerop (step))
4095 *inv_p = true;
4096 else
4097 *inv_p = false;
4099 /* Create an expression for the first address accessed by this load
4100 in LOOP. */
4101 base_name = get_name (DR_BASE_ADDRESS (dr));
4103 if (dump_enabled_p ())
4105 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4106 dump_printf_loc (MSG_NOTE, vect_location,
4107 "create %s-pointer variable to type: ",
4108 get_tree_code_name (TREE_CODE (aggr_type)));
4109 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4110 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4111 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4112 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4113 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4114 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4115 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4116 else
4117 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4118 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4119 dump_printf (MSG_NOTE, "\n");
4122 /* (1) Create the new aggregate-pointer variable.
4123 Vector and array types inherit the alias set of their component
4124 type by default so we need to use a ref-all pointer if the data
4125 reference does not conflict with the created aggregated data
4126 reference because it is not addressable. */
4127 bool need_ref_all = false;
4128 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4129 get_alias_set (DR_REF (dr))))
4130 need_ref_all = true;
4131 /* Likewise for any of the data references in the stmt group. */
4132 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4134 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4137 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4138 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4139 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4140 get_alias_set (DR_REF (sdr))))
4142 need_ref_all = true;
4143 break;
4145 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4147 while (orig_stmt);
4149 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4150 need_ref_all);
4151 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4154 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4155 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4156 def-use update cycles for the pointer: one relative to the outer-loop
4157 (LOOP), which is what steps (3) and (4) below do. The other is relative
4158 to the inner-loop (which is the inner-most loop containing the dataref),
4159 and this is done be step (5) below.
4161 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4162 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4163 redundant. Steps (3),(4) create the following:
4165 vp0 = &base_addr;
4166 LOOP: vp1 = phi(vp0,vp2)
4169 vp2 = vp1 + step
4170 goto LOOP
4172 If there is an inner-loop nested in loop, then step (5) will also be
4173 applied, and an additional update in the inner-loop will be created:
4175 vp0 = &base_addr;
4176 LOOP: vp1 = phi(vp0,vp2)
4178 inner: vp3 = phi(vp1,vp4)
4179 vp4 = vp3 + inner_step
4180 if () goto inner
4182 vp2 = vp1 + step
4183 if () goto LOOP */
4185 /* (2) Calculate the initial address of the aggregate-pointer, and set
4186 the aggregate-pointer to point to it before the loop. */
4188 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4190 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4191 offset, loop, byte_offset);
4192 if (new_stmt_list)
4194 if (pe)
4196 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4197 gcc_assert (!new_bb);
4199 else
4200 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4203 *initial_address = new_temp;
4204 aggr_ptr_init = new_temp;
4206 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4207 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4208 inner-loop nested in LOOP (during outer-loop vectorization). */
4210 /* No update in loop is required. */
4211 if (only_init && (!loop_vinfo || at_loop == loop))
4212 aptr = aggr_ptr_init;
4213 else
4215 /* The step of the aggregate pointer is the type size. */
4216 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4217 /* One exception to the above is when the scalar step of the load in
4218 LOOP is zero. In this case the step here is also zero. */
4219 if (*inv_p)
4220 iv_step = size_zero_node;
4221 else if (tree_int_cst_sgn (step) == -1)
4222 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4224 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4226 create_iv (aggr_ptr_init,
4227 fold_convert (aggr_ptr_type, iv_step),
4228 aggr_ptr, loop, &incr_gsi, insert_after,
4229 &indx_before_incr, &indx_after_incr);
4230 incr = gsi_stmt (incr_gsi);
4231 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4233 /* Copy the points-to information if it exists. */
4234 if (DR_PTR_INFO (dr))
4236 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4237 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4239 if (ptr_incr)
4240 *ptr_incr = incr;
4242 aptr = indx_before_incr;
4245 if (!nested_in_vect_loop || only_init)
4246 return aptr;
4249 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4250 nested in LOOP, if exists. */
4252 gcc_assert (nested_in_vect_loop);
4253 if (!only_init)
4255 standard_iv_increment_position (containing_loop, &incr_gsi,
4256 &insert_after);
4257 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4258 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4259 &indx_after_incr);
4260 incr = gsi_stmt (incr_gsi);
4261 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4263 /* Copy the points-to information if it exists. */
4264 if (DR_PTR_INFO (dr))
4266 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4267 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4269 if (ptr_incr)
4270 *ptr_incr = incr;
4272 return indx_before_incr;
4274 else
4275 gcc_unreachable ();
4279 /* Function bump_vector_ptr
4281 Increment a pointer (to a vector type) by vector-size. If requested,
4282 i.e. if PTR-INCR is given, then also connect the new increment stmt
4283 to the existing def-use update-chain of the pointer, by modifying
4284 the PTR_INCR as illustrated below:
4286 The pointer def-use update-chain before this function:
4287 DATAREF_PTR = phi (p_0, p_2)
4288 ....
4289 PTR_INCR: p_2 = DATAREF_PTR + step
4291 The pointer def-use update-chain after this function:
4292 DATAREF_PTR = phi (p_0, p_2)
4293 ....
4294 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4295 ....
4296 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4298 Input:
4299 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4300 in the loop.
4301 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4302 the loop. The increment amount across iterations is expected
4303 to be vector_size.
4304 BSI - location where the new update stmt is to be placed.
4305 STMT - the original scalar memory-access stmt that is being vectorized.
4306 BUMP - optional. The offset by which to bump the pointer. If not given,
4307 the offset is assumed to be vector_size.
4309 Output: Return NEW_DATAREF_PTR as illustrated above.
4313 tree
4314 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4315 gimple stmt, tree bump)
4317 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4318 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4319 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4320 tree update = TYPE_SIZE_UNIT (vectype);
4321 gassign *incr_stmt;
4322 ssa_op_iter iter;
4323 use_operand_p use_p;
4324 tree new_dataref_ptr;
4326 if (bump)
4327 update = bump;
4329 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4330 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4331 else
4332 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4333 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4334 dataref_ptr, update);
4335 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4337 /* Copy the points-to information if it exists. */
4338 if (DR_PTR_INFO (dr))
4340 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4341 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4344 if (!ptr_incr)
4345 return new_dataref_ptr;
4347 /* Update the vector-pointer's cross-iteration increment. */
4348 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4350 tree use = USE_FROM_PTR (use_p);
4352 if (use == dataref_ptr)
4353 SET_USE (use_p, new_dataref_ptr);
4354 else
4355 gcc_assert (tree_int_cst_compare (use, update) == 0);
4358 return new_dataref_ptr;
4362 /* Function vect_create_destination_var.
4364 Create a new temporary of type VECTYPE. */
4366 tree
4367 vect_create_destination_var (tree scalar_dest, tree vectype)
4369 tree vec_dest;
4370 const char *name;
4371 char *new_name;
4372 tree type;
4373 enum vect_var_kind kind;
4375 kind = vectype ? vect_simple_var : vect_scalar_var;
4376 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4378 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4380 name = get_name (scalar_dest);
4381 if (name)
4382 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4383 else
4384 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4385 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4386 free (new_name);
4388 return vec_dest;
4391 /* Function vect_grouped_store_supported.
4393 Returns TRUE if interleave high and interleave low permutations
4394 are supported, and FALSE otherwise. */
4396 bool
4397 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4399 machine_mode mode = TYPE_MODE (vectype);
4401 /* vect_permute_store_chain requires the group size to be equal to 3 or
4402 be a power of two. */
4403 if (count != 3 && exact_log2 (count) == -1)
4405 if (dump_enabled_p ())
4406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4407 "the size of the group of accesses"
4408 " is not a power of 2 or not eqaul to 3\n");
4409 return false;
4412 /* Check that the permutation is supported. */
4413 if (VECTOR_MODE_P (mode))
4415 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4416 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4418 if (count == 3)
4420 unsigned int j0 = 0, j1 = 0, j2 = 0;
4421 unsigned int i, j;
4423 for (j = 0; j < 3; j++)
4425 int nelt0 = ((3 - j) * nelt) % 3;
4426 int nelt1 = ((3 - j) * nelt + 1) % 3;
4427 int nelt2 = ((3 - j) * nelt + 2) % 3;
4428 for (i = 0; i < nelt; i++)
4430 if (3 * i + nelt0 < nelt)
4431 sel[3 * i + nelt0] = j0++;
4432 if (3 * i + nelt1 < nelt)
4433 sel[3 * i + nelt1] = nelt + j1++;
4434 if (3 * i + nelt2 < nelt)
4435 sel[3 * i + nelt2] = 0;
4437 if (!can_vec_perm_p (mode, false, sel))
4439 if (dump_enabled_p ())
4440 dump_printf (MSG_MISSED_OPTIMIZATION,
4441 "permutaion op not supported by target.\n");
4442 return false;
4445 for (i = 0; i < nelt; i++)
4447 if (3 * i + nelt0 < nelt)
4448 sel[3 * i + nelt0] = 3 * i + nelt0;
4449 if (3 * i + nelt1 < nelt)
4450 sel[3 * i + nelt1] = 3 * i + nelt1;
4451 if (3 * i + nelt2 < nelt)
4452 sel[3 * i + nelt2] = nelt + j2++;
4454 if (!can_vec_perm_p (mode, false, sel))
4456 if (dump_enabled_p ())
4457 dump_printf (MSG_MISSED_OPTIMIZATION,
4458 "permutaion op not supported by target.\n");
4459 return false;
4462 return true;
4464 else
4466 /* If length is not equal to 3 then only power of 2 is supported. */
4467 gcc_assert (exact_log2 (count) != -1);
4469 for (i = 0; i < nelt / 2; i++)
4471 sel[i * 2] = i;
4472 sel[i * 2 + 1] = i + nelt;
4474 if (can_vec_perm_p (mode, false, sel))
4476 for (i = 0; i < nelt; i++)
4477 sel[i] += nelt / 2;
4478 if (can_vec_perm_p (mode, false, sel))
4479 return true;
4484 if (dump_enabled_p ())
4485 dump_printf (MSG_MISSED_OPTIMIZATION,
4486 "permutaion op not supported by target.\n");
4487 return false;
4491 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4492 type VECTYPE. */
4494 bool
4495 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4497 return vect_lanes_optab_supported_p ("vec_store_lanes",
4498 vec_store_lanes_optab,
4499 vectype, count);
4503 /* Function vect_permute_store_chain.
4505 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4506 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4507 the data correctly for the stores. Return the final references for stores
4508 in RESULT_CHAIN.
4510 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4511 The input is 4 vectors each containing 8 elements. We assign a number to
4512 each element, the input sequence is:
4514 1st vec: 0 1 2 3 4 5 6 7
4515 2nd vec: 8 9 10 11 12 13 14 15
4516 3rd vec: 16 17 18 19 20 21 22 23
4517 4th vec: 24 25 26 27 28 29 30 31
4519 The output sequence should be:
4521 1st vec: 0 8 16 24 1 9 17 25
4522 2nd vec: 2 10 18 26 3 11 19 27
4523 3rd vec: 4 12 20 28 5 13 21 30
4524 4th vec: 6 14 22 30 7 15 23 31
4526 i.e., we interleave the contents of the four vectors in their order.
4528 We use interleave_high/low instructions to create such output. The input of
4529 each interleave_high/low operation is two vectors:
4530 1st vec 2nd vec
4531 0 1 2 3 4 5 6 7
4532 the even elements of the result vector are obtained left-to-right from the
4533 high/low elements of the first vector. The odd elements of the result are
4534 obtained left-to-right from the high/low elements of the second vector.
4535 The output of interleave_high will be: 0 4 1 5
4536 and of interleave_low: 2 6 3 7
4539 The permutation is done in log LENGTH stages. In each stage interleave_high
4540 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4541 where the first argument is taken from the first half of DR_CHAIN and the
4542 second argument from it's second half.
4543 In our example,
4545 I1: interleave_high (1st vec, 3rd vec)
4546 I2: interleave_low (1st vec, 3rd vec)
4547 I3: interleave_high (2nd vec, 4th vec)
4548 I4: interleave_low (2nd vec, 4th vec)
4550 The output for the first stage is:
4552 I1: 0 16 1 17 2 18 3 19
4553 I2: 4 20 5 21 6 22 7 23
4554 I3: 8 24 9 25 10 26 11 27
4555 I4: 12 28 13 29 14 30 15 31
4557 The output of the second stage, i.e. the final result is:
4559 I1: 0 8 16 24 1 9 17 25
4560 I2: 2 10 18 26 3 11 19 27
4561 I3: 4 12 20 28 5 13 21 30
4562 I4: 6 14 22 30 7 15 23 31. */
4564 void
4565 vect_permute_store_chain (vec<tree> dr_chain,
4566 unsigned int length,
4567 gimple stmt,
4568 gimple_stmt_iterator *gsi,
4569 vec<tree> *result_chain)
4571 tree vect1, vect2, high, low;
4572 gimple perm_stmt;
4573 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4574 tree perm_mask_low, perm_mask_high;
4575 tree data_ref;
4576 tree perm3_mask_low, perm3_mask_high;
4577 unsigned int i, n, log_length = exact_log2 (length);
4578 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4579 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4581 result_chain->quick_grow (length);
4582 memcpy (result_chain->address (), dr_chain.address (),
4583 length * sizeof (tree));
4585 if (length == 3)
4587 unsigned int j0 = 0, j1 = 0, j2 = 0;
4589 for (j = 0; j < 3; j++)
4591 int nelt0 = ((3 - j) * nelt) % 3;
4592 int nelt1 = ((3 - j) * nelt + 1) % 3;
4593 int nelt2 = ((3 - j) * nelt + 2) % 3;
4595 for (i = 0; i < nelt; i++)
4597 if (3 * i + nelt0 < nelt)
4598 sel[3 * i + nelt0] = j0++;
4599 if (3 * i + nelt1 < nelt)
4600 sel[3 * i + nelt1] = nelt + j1++;
4601 if (3 * i + nelt2 < nelt)
4602 sel[3 * i + nelt2] = 0;
4604 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4606 for (i = 0; i < nelt; i++)
4608 if (3 * i + nelt0 < nelt)
4609 sel[3 * i + nelt0] = 3 * i + nelt0;
4610 if (3 * i + nelt1 < nelt)
4611 sel[3 * i + nelt1] = 3 * i + nelt1;
4612 if (3 * i + nelt2 < nelt)
4613 sel[3 * i + nelt2] = nelt + j2++;
4615 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4617 vect1 = dr_chain[0];
4618 vect2 = dr_chain[1];
4620 /* Create interleaving stmt:
4621 low = VEC_PERM_EXPR <vect1, vect2,
4622 {j, nelt, *, j + 1, nelt + j + 1, *,
4623 j + 2, nelt + j + 2, *, ...}> */
4624 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4625 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4626 vect2, perm3_mask_low);
4627 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4629 vect1 = data_ref;
4630 vect2 = dr_chain[2];
4631 /* Create interleaving stmt:
4632 low = VEC_PERM_EXPR <vect1, vect2,
4633 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4634 6, 7, nelt + j + 2, ...}> */
4635 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4636 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4637 vect2, perm3_mask_high);
4638 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4639 (*result_chain)[j] = data_ref;
4642 else
4644 /* If length is not equal to 3 then only power of 2 is supported. */
4645 gcc_assert (exact_log2 (length) != -1);
4647 for (i = 0, n = nelt / 2; i < n; i++)
4649 sel[i * 2] = i;
4650 sel[i * 2 + 1] = i + nelt;
4652 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4654 for (i = 0; i < nelt; i++)
4655 sel[i] += nelt / 2;
4656 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4658 for (i = 0, n = log_length; i < n; i++)
4660 for (j = 0; j < length/2; j++)
4662 vect1 = dr_chain[j];
4663 vect2 = dr_chain[j+length/2];
4665 /* Create interleaving stmt:
4666 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4667 ...}> */
4668 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4669 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4670 vect2, perm_mask_high);
4671 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4672 (*result_chain)[2*j] = high;
4674 /* Create interleaving stmt:
4675 low = VEC_PERM_EXPR <vect1, vect2,
4676 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4677 ...}> */
4678 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4679 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4680 vect2, perm_mask_low);
4681 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4682 (*result_chain)[2*j+1] = low;
4684 memcpy (dr_chain.address (), result_chain->address (),
4685 length * sizeof (tree));
4690 /* Function vect_setup_realignment
4692 This function is called when vectorizing an unaligned load using
4693 the dr_explicit_realign[_optimized] scheme.
4694 This function generates the following code at the loop prolog:
4696 p = initial_addr;
4697 x msq_init = *(floor(p)); # prolog load
4698 realignment_token = call target_builtin;
4699 loop:
4700 x msq = phi (msq_init, ---)
4702 The stmts marked with x are generated only for the case of
4703 dr_explicit_realign_optimized.
4705 The code above sets up a new (vector) pointer, pointing to the first
4706 location accessed by STMT, and a "floor-aligned" load using that pointer.
4707 It also generates code to compute the "realignment-token" (if the relevant
4708 target hook was defined), and creates a phi-node at the loop-header bb
4709 whose arguments are the result of the prolog-load (created by this
4710 function) and the result of a load that takes place in the loop (to be
4711 created by the caller to this function).
4713 For the case of dr_explicit_realign_optimized:
4714 The caller to this function uses the phi-result (msq) to create the
4715 realignment code inside the loop, and sets up the missing phi argument,
4716 as follows:
4717 loop:
4718 msq = phi (msq_init, lsq)
4719 lsq = *(floor(p')); # load in loop
4720 result = realign_load (msq, lsq, realignment_token);
4722 For the case of dr_explicit_realign:
4723 loop:
4724 msq = *(floor(p)); # load in loop
4725 p' = p + (VS-1);
4726 lsq = *(floor(p')); # load in loop
4727 result = realign_load (msq, lsq, realignment_token);
4729 Input:
4730 STMT - (scalar) load stmt to be vectorized. This load accesses
4731 a memory location that may be unaligned.
4732 BSI - place where new code is to be inserted.
4733 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4734 is used.
4736 Output:
4737 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4738 target hook, if defined.
4739 Return value - the result of the loop-header phi node. */
4741 tree
4742 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4743 tree *realignment_token,
4744 enum dr_alignment_support alignment_support_scheme,
4745 tree init_addr,
4746 struct loop **at_loop)
4748 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4749 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4750 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4751 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4752 struct loop *loop = NULL;
4753 edge pe = NULL;
4754 tree scalar_dest = gimple_assign_lhs (stmt);
4755 tree vec_dest;
4756 gimple inc;
4757 tree ptr;
4758 tree data_ref;
4759 basic_block new_bb;
4760 tree msq_init = NULL_TREE;
4761 tree new_temp;
4762 gphi *phi_stmt;
4763 tree msq = NULL_TREE;
4764 gimple_seq stmts = NULL;
4765 bool inv_p;
4766 bool compute_in_loop = false;
4767 bool nested_in_vect_loop = false;
4768 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4769 struct loop *loop_for_initial_load = NULL;
4771 if (loop_vinfo)
4773 loop = LOOP_VINFO_LOOP (loop_vinfo);
4774 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4777 gcc_assert (alignment_support_scheme == dr_explicit_realign
4778 || alignment_support_scheme == dr_explicit_realign_optimized);
4780 /* We need to generate three things:
4781 1. the misalignment computation
4782 2. the extra vector load (for the optimized realignment scheme).
4783 3. the phi node for the two vectors from which the realignment is
4784 done (for the optimized realignment scheme). */
4786 /* 1. Determine where to generate the misalignment computation.
4788 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4789 calculation will be generated by this function, outside the loop (in the
4790 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4791 caller, inside the loop.
4793 Background: If the misalignment remains fixed throughout the iterations of
4794 the loop, then both realignment schemes are applicable, and also the
4795 misalignment computation can be done outside LOOP. This is because we are
4796 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4797 are a multiple of VS (the Vector Size), and therefore the misalignment in
4798 different vectorized LOOP iterations is always the same.
4799 The problem arises only if the memory access is in an inner-loop nested
4800 inside LOOP, which is now being vectorized using outer-loop vectorization.
4801 This is the only case when the misalignment of the memory access may not
4802 remain fixed throughout the iterations of the inner-loop (as explained in
4803 detail in vect_supportable_dr_alignment). In this case, not only is the
4804 optimized realignment scheme not applicable, but also the misalignment
4805 computation (and generation of the realignment token that is passed to
4806 REALIGN_LOAD) have to be done inside the loop.
4808 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4809 or not, which in turn determines if the misalignment is computed inside
4810 the inner-loop, or outside LOOP. */
4812 if (init_addr != NULL_TREE || !loop_vinfo)
4814 compute_in_loop = true;
4815 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4819 /* 2. Determine where to generate the extra vector load.
4821 For the optimized realignment scheme, instead of generating two vector
4822 loads in each iteration, we generate a single extra vector load in the
4823 preheader of the loop, and in each iteration reuse the result of the
4824 vector load from the previous iteration. In case the memory access is in
4825 an inner-loop nested inside LOOP, which is now being vectorized using
4826 outer-loop vectorization, we need to determine whether this initial vector
4827 load should be generated at the preheader of the inner-loop, or can be
4828 generated at the preheader of LOOP. If the memory access has no evolution
4829 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4830 to be generated inside LOOP (in the preheader of the inner-loop). */
4832 if (nested_in_vect_loop)
4834 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4835 bool invariant_in_outerloop =
4836 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4837 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4839 else
4840 loop_for_initial_load = loop;
4841 if (at_loop)
4842 *at_loop = loop_for_initial_load;
4844 if (loop_for_initial_load)
4845 pe = loop_preheader_edge (loop_for_initial_load);
4847 /* 3. For the case of the optimized realignment, create the first vector
4848 load at the loop preheader. */
4850 if (alignment_support_scheme == dr_explicit_realign_optimized)
4852 /* Create msq_init = *(floor(p1)) in the loop preheader */
4853 gassign *new_stmt;
4855 gcc_assert (!compute_in_loop);
4856 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4857 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4858 NULL_TREE, &init_addr, NULL, &inc,
4859 true, &inv_p);
4860 if (TREE_CODE (ptr) == SSA_NAME)
4861 new_temp = copy_ssa_name (ptr);
4862 else
4863 new_temp = make_ssa_name (TREE_TYPE (ptr));
4864 new_stmt = gimple_build_assign
4865 (new_temp, BIT_AND_EXPR, ptr,
4866 build_int_cst (TREE_TYPE (ptr),
4867 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4868 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4869 gcc_assert (!new_bb);
4870 data_ref
4871 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4872 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4873 new_stmt = gimple_build_assign (vec_dest, data_ref);
4874 new_temp = make_ssa_name (vec_dest, new_stmt);
4875 gimple_assign_set_lhs (new_stmt, new_temp);
4876 if (pe)
4878 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4879 gcc_assert (!new_bb);
4881 else
4882 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4884 msq_init = gimple_assign_lhs (new_stmt);
4887 /* 4. Create realignment token using a target builtin, if available.
4888 It is done either inside the containing loop, or before LOOP (as
4889 determined above). */
4891 if (targetm.vectorize.builtin_mask_for_load)
4893 gcall *new_stmt;
4894 tree builtin_decl;
4896 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4897 if (!init_addr)
4899 /* Generate the INIT_ADDR computation outside LOOP. */
4900 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4901 NULL_TREE, loop);
4902 if (loop)
4904 pe = loop_preheader_edge (loop);
4905 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4906 gcc_assert (!new_bb);
4908 else
4909 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4912 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4913 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4914 vec_dest =
4915 vect_create_destination_var (scalar_dest,
4916 gimple_call_return_type (new_stmt));
4917 new_temp = make_ssa_name (vec_dest, new_stmt);
4918 gimple_call_set_lhs (new_stmt, new_temp);
4920 if (compute_in_loop)
4921 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4922 else
4924 /* Generate the misalignment computation outside LOOP. */
4925 pe = loop_preheader_edge (loop);
4926 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4927 gcc_assert (!new_bb);
4930 *realignment_token = gimple_call_lhs (new_stmt);
4932 /* The result of the CALL_EXPR to this builtin is determined from
4933 the value of the parameter and no global variables are touched
4934 which makes the builtin a "const" function. Requiring the
4935 builtin to have the "const" attribute makes it unnecessary
4936 to call mark_call_clobbered. */
4937 gcc_assert (TREE_READONLY (builtin_decl));
4940 if (alignment_support_scheme == dr_explicit_realign)
4941 return msq;
4943 gcc_assert (!compute_in_loop);
4944 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4947 /* 5. Create msq = phi <msq_init, lsq> in loop */
4949 pe = loop_preheader_edge (containing_loop);
4950 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4951 msq = make_ssa_name (vec_dest);
4952 phi_stmt = create_phi_node (msq, containing_loop->header);
4953 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4955 return msq;
4959 /* Function vect_grouped_load_supported.
4961 Returns TRUE if even and odd permutations are supported,
4962 and FALSE otherwise. */
4964 bool
4965 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4967 machine_mode mode = TYPE_MODE (vectype);
4969 /* vect_permute_load_chain requires the group size to be equal to 3 or
4970 be a power of two. */
4971 if (count != 3 && exact_log2 (count) == -1)
4973 if (dump_enabled_p ())
4974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4975 "the size of the group of accesses"
4976 " is not a power of 2 or not equal to 3\n");
4977 return false;
4980 /* Check that the permutation is supported. */
4981 if (VECTOR_MODE_P (mode))
4983 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4984 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4986 if (count == 3)
4988 unsigned int k;
4989 for (k = 0; k < 3; k++)
4991 for (i = 0; i < nelt; i++)
4992 if (3 * i + k < 2 * nelt)
4993 sel[i] = 3 * i + k;
4994 else
4995 sel[i] = 0;
4996 if (!can_vec_perm_p (mode, false, sel))
4998 if (dump_enabled_p ())
4999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5000 "shuffle of 3 loads is not supported by"
5001 " target\n");
5002 return false;
5004 for (i = 0, j = 0; i < nelt; i++)
5005 if (3 * i + k < 2 * nelt)
5006 sel[i] = i;
5007 else
5008 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5009 if (!can_vec_perm_p (mode, false, sel))
5011 if (dump_enabled_p ())
5012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5013 "shuffle of 3 loads is not supported by"
5014 " target\n");
5015 return false;
5018 return true;
5020 else
5022 /* If length is not equal to 3 then only power of 2 is supported. */
5023 gcc_assert (exact_log2 (count) != -1);
5024 for (i = 0; i < nelt; i++)
5025 sel[i] = i * 2;
5026 if (can_vec_perm_p (mode, false, sel))
5028 for (i = 0; i < nelt; i++)
5029 sel[i] = i * 2 + 1;
5030 if (can_vec_perm_p (mode, false, sel))
5031 return true;
5036 if (dump_enabled_p ())
5037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5038 "extract even/odd not supported by target\n");
5039 return false;
5042 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5043 type VECTYPE. */
5045 bool
5046 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5048 return vect_lanes_optab_supported_p ("vec_load_lanes",
5049 vec_load_lanes_optab,
5050 vectype, count);
5053 /* Function vect_permute_load_chain.
5055 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5056 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5057 the input data correctly. Return the final references for loads in
5058 RESULT_CHAIN.
5060 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5061 The input is 4 vectors each containing 8 elements. We assign a number to each
5062 element, the input sequence is:
5064 1st vec: 0 1 2 3 4 5 6 7
5065 2nd vec: 8 9 10 11 12 13 14 15
5066 3rd vec: 16 17 18 19 20 21 22 23
5067 4th vec: 24 25 26 27 28 29 30 31
5069 The output sequence should be:
5071 1st vec: 0 4 8 12 16 20 24 28
5072 2nd vec: 1 5 9 13 17 21 25 29
5073 3rd vec: 2 6 10 14 18 22 26 30
5074 4th vec: 3 7 11 15 19 23 27 31
5076 i.e., the first output vector should contain the first elements of each
5077 interleaving group, etc.
5079 We use extract_even/odd instructions to create such output. The input of
5080 each extract_even/odd operation is two vectors
5081 1st vec 2nd vec
5082 0 1 2 3 4 5 6 7
5084 and the output is the vector of extracted even/odd elements. The output of
5085 extract_even will be: 0 2 4 6
5086 and of extract_odd: 1 3 5 7
5089 The permutation is done in log LENGTH stages. In each stage extract_even
5090 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5091 their order. In our example,
5093 E1: extract_even (1st vec, 2nd vec)
5094 E2: extract_odd (1st vec, 2nd vec)
5095 E3: extract_even (3rd vec, 4th vec)
5096 E4: extract_odd (3rd vec, 4th vec)
5098 The output for the first stage will be:
5100 E1: 0 2 4 6 8 10 12 14
5101 E2: 1 3 5 7 9 11 13 15
5102 E3: 16 18 20 22 24 26 28 30
5103 E4: 17 19 21 23 25 27 29 31
5105 In order to proceed and create the correct sequence for the next stage (or
5106 for the correct output, if the second stage is the last one, as in our
5107 example), we first put the output of extract_even operation and then the
5108 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5109 The input for the second stage is:
5111 1st vec (E1): 0 2 4 6 8 10 12 14
5112 2nd vec (E3): 16 18 20 22 24 26 28 30
5113 3rd vec (E2): 1 3 5 7 9 11 13 15
5114 4th vec (E4): 17 19 21 23 25 27 29 31
5116 The output of the second stage:
5118 E1: 0 4 8 12 16 20 24 28
5119 E2: 2 6 10 14 18 22 26 30
5120 E3: 1 5 9 13 17 21 25 29
5121 E4: 3 7 11 15 19 23 27 31
5123 And RESULT_CHAIN after reordering:
5125 1st vec (E1): 0 4 8 12 16 20 24 28
5126 2nd vec (E3): 1 5 9 13 17 21 25 29
5127 3rd vec (E2): 2 6 10 14 18 22 26 30
5128 4th vec (E4): 3 7 11 15 19 23 27 31. */
5130 static void
5131 vect_permute_load_chain (vec<tree> dr_chain,
5132 unsigned int length,
5133 gimple stmt,
5134 gimple_stmt_iterator *gsi,
5135 vec<tree> *result_chain)
5137 tree data_ref, first_vect, second_vect;
5138 tree perm_mask_even, perm_mask_odd;
5139 tree perm3_mask_low, perm3_mask_high;
5140 gimple perm_stmt;
5141 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5142 unsigned int i, j, log_length = exact_log2 (length);
5143 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5144 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5146 result_chain->quick_grow (length);
5147 memcpy (result_chain->address (), dr_chain.address (),
5148 length * sizeof (tree));
5150 if (length == 3)
5152 unsigned int k;
5154 for (k = 0; k < 3; k++)
5156 for (i = 0; i < nelt; i++)
5157 if (3 * i + k < 2 * nelt)
5158 sel[i] = 3 * i + k;
5159 else
5160 sel[i] = 0;
5161 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5163 for (i = 0, j = 0; i < nelt; i++)
5164 if (3 * i + k < 2 * nelt)
5165 sel[i] = i;
5166 else
5167 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5169 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5171 first_vect = dr_chain[0];
5172 second_vect = dr_chain[1];
5174 /* Create interleaving stmt (low part of):
5175 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5176 ...}> */
5177 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5178 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5179 second_vect, perm3_mask_low);
5180 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5182 /* Create interleaving stmt (high part of):
5183 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5184 ...}> */
5185 first_vect = data_ref;
5186 second_vect = dr_chain[2];
5187 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5188 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5189 second_vect, perm3_mask_high);
5190 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5191 (*result_chain)[k] = data_ref;
5194 else
5196 /* If length is not equal to 3 then only power of 2 is supported. */
5197 gcc_assert (exact_log2 (length) != -1);
5199 for (i = 0; i < nelt; ++i)
5200 sel[i] = i * 2;
5201 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5203 for (i = 0; i < nelt; ++i)
5204 sel[i] = i * 2 + 1;
5205 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5207 for (i = 0; i < log_length; i++)
5209 for (j = 0; j < length; j += 2)
5211 first_vect = dr_chain[j];
5212 second_vect = dr_chain[j+1];
5214 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5215 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5216 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5217 first_vect, second_vect,
5218 perm_mask_even);
5219 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5220 (*result_chain)[j/2] = data_ref;
5222 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5223 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5224 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5225 first_vect, second_vect,
5226 perm_mask_odd);
5227 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5228 (*result_chain)[j/2+length/2] = data_ref;
5230 memcpy (dr_chain.address (), result_chain->address (),
5231 length * sizeof (tree));
5236 /* Function vect_shift_permute_load_chain.
5238 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5239 sequence of stmts to reorder the input data accordingly.
5240 Return the final references for loads in RESULT_CHAIN.
5241 Return true if successed, false otherwise.
5243 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5244 The input is 3 vectors each containing 8 elements. We assign a
5245 number to each element, the input sequence is:
5247 1st vec: 0 1 2 3 4 5 6 7
5248 2nd vec: 8 9 10 11 12 13 14 15
5249 3rd vec: 16 17 18 19 20 21 22 23
5251 The output sequence should be:
5253 1st vec: 0 3 6 9 12 15 18 21
5254 2nd vec: 1 4 7 10 13 16 19 22
5255 3rd vec: 2 5 8 11 14 17 20 23
5257 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5259 First we shuffle all 3 vectors to get correct elements order:
5261 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5262 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5263 3rd vec: (16 19 22) (17 20 23) (18 21)
5265 Next we unite and shift vector 3 times:
5267 1st step:
5268 shift right by 6 the concatenation of:
5269 "1st vec" and "2nd vec"
5270 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5271 "2nd vec" and "3rd vec"
5272 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5273 "3rd vec" and "1st vec"
5274 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5275 | New vectors |
5277 So that now new vectors are:
5279 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5280 2nd vec: (10 13) (16 19 22) (17 20 23)
5281 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5283 2nd step:
5284 shift right by 5 the concatenation of:
5285 "1st vec" and "3rd vec"
5286 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5287 "2nd vec" and "1st vec"
5288 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5289 "3rd vec" and "2nd vec"
5290 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5291 | New vectors |
5293 So that now new vectors are:
5295 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5296 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5297 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5299 3rd step:
5300 shift right by 5 the concatenation of:
5301 "1st vec" and "1st vec"
5302 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5303 shift right by 3 the concatenation of:
5304 "2nd vec" and "2nd vec"
5305 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5306 | New vectors |
5308 So that now all vectors are READY:
5309 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5310 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5311 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5313 This algorithm is faster than one in vect_permute_load_chain if:
5314 1. "shift of a concatination" is faster than general permutation.
5315 This is usually so.
5316 2. The TARGET machine can't execute vector instructions in parallel.
5317 This is because each step of the algorithm depends on previous.
5318 The algorithm in vect_permute_load_chain is much more parallel.
5320 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5323 static bool
5324 vect_shift_permute_load_chain (vec<tree> dr_chain,
5325 unsigned int length,
5326 gimple stmt,
5327 gimple_stmt_iterator *gsi,
5328 vec<tree> *result_chain)
5330 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5331 tree perm2_mask1, perm2_mask2, perm3_mask;
5332 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5333 gimple perm_stmt;
5335 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5336 unsigned int i;
5337 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5338 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5339 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5340 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5342 result_chain->quick_grow (length);
5343 memcpy (result_chain->address (), dr_chain.address (),
5344 length * sizeof (tree));
5346 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5348 unsigned int j, log_length = exact_log2 (length);
5349 for (i = 0; i < nelt / 2; ++i)
5350 sel[i] = i * 2;
5351 for (i = 0; i < nelt / 2; ++i)
5352 sel[nelt / 2 + i] = i * 2 + 1;
5353 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5355 if (dump_enabled_p ())
5356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5357 "shuffle of 2 fields structure is not \
5358 supported by target\n");
5359 return false;
5361 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5363 for (i = 0; i < nelt / 2; ++i)
5364 sel[i] = i * 2 + 1;
5365 for (i = 0; i < nelt / 2; ++i)
5366 sel[nelt / 2 + i] = i * 2;
5367 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5369 if (dump_enabled_p ())
5370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5371 "shuffle of 2 fields structure is not \
5372 supported by target\n");
5373 return false;
5375 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5377 /* Generating permutation constant to shift all elements.
5378 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5379 for (i = 0; i < nelt; i++)
5380 sel[i] = nelt / 2 + i;
5381 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5383 if (dump_enabled_p ())
5384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5385 "shift permutation is not supported by target\n");
5386 return false;
5388 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5390 /* Generating permutation constant to select vector from 2.
5391 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5392 for (i = 0; i < nelt / 2; i++)
5393 sel[i] = i;
5394 for (i = nelt / 2; i < nelt; i++)
5395 sel[i] = nelt + i;
5396 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5398 if (dump_enabled_p ())
5399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5400 "select is not supported by target\n");
5401 return false;
5403 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5405 for (i = 0; i < log_length; i++)
5407 for (j = 0; j < length; j += 2)
5409 first_vect = dr_chain[j];
5410 second_vect = dr_chain[j + 1];
5412 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5413 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5414 first_vect, first_vect,
5415 perm2_mask1);
5416 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5417 vect[0] = data_ref;
5419 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5420 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5421 second_vect, second_vect,
5422 perm2_mask2);
5423 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5424 vect[1] = data_ref;
5426 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5427 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5428 vect[0], vect[1], shift1_mask);
5429 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5430 (*result_chain)[j/2 + length/2] = data_ref;
5432 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5433 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5434 vect[0], vect[1], select_mask);
5435 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5436 (*result_chain)[j/2] = data_ref;
5438 memcpy (dr_chain.address (), result_chain->address (),
5439 length * sizeof (tree));
5441 return true;
5443 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5445 unsigned int k = 0, l = 0;
5447 /* Generating permutation constant to get all elements in rigth order.
5448 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5449 for (i = 0; i < nelt; i++)
5451 if (3 * k + (l % 3) >= nelt)
5453 k = 0;
5454 l += (3 - (nelt % 3));
5456 sel[i] = 3 * k + (l % 3);
5457 k++;
5459 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5461 if (dump_enabled_p ())
5462 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5463 "shuffle of 3 fields structure is not \
5464 supported by target\n");
5465 return false;
5467 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5469 /* Generating permutation constant to shift all elements.
5470 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5471 for (i = 0; i < nelt; i++)
5472 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5473 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5475 if (dump_enabled_p ())
5476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5477 "shift permutation is not supported by target\n");
5478 return false;
5480 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5482 /* Generating permutation constant to shift all elements.
5483 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5484 for (i = 0; i < nelt; i++)
5485 sel[i] = 2 * (nelt / 3) + 1 + i;
5486 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5488 if (dump_enabled_p ())
5489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5490 "shift permutation is not supported by target\n");
5491 return false;
5493 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5495 /* Generating permutation constant to shift all elements.
5496 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5497 for (i = 0; i < nelt; i++)
5498 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5499 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5501 if (dump_enabled_p ())
5502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5503 "shift permutation is not supported by target\n");
5504 return false;
5506 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5508 /* Generating permutation constant to shift all elements.
5509 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5510 for (i = 0; i < nelt; i++)
5511 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5512 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5514 if (dump_enabled_p ())
5515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5516 "shift permutation is not supported by target\n");
5517 return false;
5519 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5521 for (k = 0; k < 3; k++)
5523 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5524 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5525 dr_chain[k], dr_chain[k],
5526 perm3_mask);
5527 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5528 vect[k] = data_ref;
5531 for (k = 0; k < 3; k++)
5533 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5534 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5535 vect[k % 3], vect[(k + 1) % 3],
5536 shift1_mask);
5537 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5538 vect_shift[k] = data_ref;
5541 for (k = 0; k < 3; k++)
5543 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5544 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5545 vect_shift[(4 - k) % 3],
5546 vect_shift[(3 - k) % 3],
5547 shift2_mask);
5548 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5549 vect[k] = data_ref;
5552 (*result_chain)[3 - (nelt % 3)] = vect[2];
5554 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5555 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5556 vect[0], shift3_mask);
5557 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5558 (*result_chain)[nelt % 3] = data_ref;
5560 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5561 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5562 vect[1], shift4_mask);
5563 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5564 (*result_chain)[0] = data_ref;
5565 return true;
5567 return false;
5570 /* Function vect_transform_grouped_load.
5572 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5573 to perform their permutation and ascribe the result vectorized statements to
5574 the scalar statements.
5577 void
5578 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5579 gimple_stmt_iterator *gsi)
5581 machine_mode mode;
5582 vec<tree> result_chain = vNULL;
5584 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5585 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5586 vectors, that are ready for vector computation. */
5587 result_chain.create (size);
5589 /* If reassociation width for vector type is 2 or greater target machine can
5590 execute 2 or more vector instructions in parallel. Otherwise try to
5591 get chain for loads group using vect_shift_permute_load_chain. */
5592 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5593 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5594 || exact_log2 (size) != -1
5595 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5596 gsi, &result_chain))
5597 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5598 vect_record_grouped_load_vectors (stmt, result_chain);
5599 result_chain.release ();
5602 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5603 generated as part of the vectorization of STMT. Assign the statement
5604 for each vector to the associated scalar statement. */
5606 void
5607 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5609 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5610 gimple next_stmt, new_stmt;
5611 unsigned int i, gap_count;
5612 tree tmp_data_ref;
5614 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5615 Since we scan the chain starting from it's first node, their order
5616 corresponds the order of data-refs in RESULT_CHAIN. */
5617 next_stmt = first_stmt;
5618 gap_count = 1;
5619 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5621 if (!next_stmt)
5622 break;
5624 /* Skip the gaps. Loads created for the gaps will be removed by dead
5625 code elimination pass later. No need to check for the first stmt in
5626 the group, since it always exists.
5627 GROUP_GAP is the number of steps in elements from the previous
5628 access (if there is no gap GROUP_GAP is 1). We skip loads that
5629 correspond to the gaps. */
5630 if (next_stmt != first_stmt
5631 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5633 gap_count++;
5634 continue;
5637 while (next_stmt)
5639 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5640 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5641 copies, and we put the new vector statement in the first available
5642 RELATED_STMT. */
5643 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5644 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5645 else
5647 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5649 gimple prev_stmt =
5650 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5651 gimple rel_stmt =
5652 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5653 while (rel_stmt)
5655 prev_stmt = rel_stmt;
5656 rel_stmt =
5657 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5660 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5661 new_stmt;
5665 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5666 gap_count = 1;
5667 /* If NEXT_STMT accesses the same DR as the previous statement,
5668 put the same TMP_DATA_REF as its vectorized statement; otherwise
5669 get the next data-ref from RESULT_CHAIN. */
5670 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5671 break;
5676 /* Function vect_force_dr_alignment_p.
5678 Returns whether the alignment of a DECL can be forced to be aligned
5679 on ALIGNMENT bit boundary. */
5681 bool
5682 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5684 if (TREE_CODE (decl) != VAR_DECL)
5685 return false;
5687 if (decl_in_symtab_p (decl)
5688 && !symtab_node::get (decl)->can_increase_alignment_p ())
5689 return false;
5691 if (TREE_STATIC (decl))
5692 return (alignment <= MAX_OFILE_ALIGNMENT);
5693 else
5694 return (alignment <= MAX_STACK_ALIGNMENT);
5698 /* Return whether the data reference DR is supported with respect to its
5699 alignment.
5700 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5701 it is aligned, i.e., check if it is possible to vectorize it with different
5702 alignment. */
5704 enum dr_alignment_support
5705 vect_supportable_dr_alignment (struct data_reference *dr,
5706 bool check_aligned_accesses)
5708 gimple stmt = DR_STMT (dr);
5709 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5710 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5711 machine_mode mode = TYPE_MODE (vectype);
5712 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5713 struct loop *vect_loop = NULL;
5714 bool nested_in_vect_loop = false;
5716 if (aligned_access_p (dr) && !check_aligned_accesses)
5717 return dr_aligned;
5719 /* For now assume all conditional loads/stores support unaligned
5720 access without any special code. */
5721 if (is_gimple_call (stmt)
5722 && gimple_call_internal_p (stmt)
5723 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5724 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5725 return dr_unaligned_supported;
5727 if (loop_vinfo)
5729 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5730 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5733 /* Possibly unaligned access. */
5735 /* We can choose between using the implicit realignment scheme (generating
5736 a misaligned_move stmt) and the explicit realignment scheme (generating
5737 aligned loads with a REALIGN_LOAD). There are two variants to the
5738 explicit realignment scheme: optimized, and unoptimized.
5739 We can optimize the realignment only if the step between consecutive
5740 vector loads is equal to the vector size. Since the vector memory
5741 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5742 is guaranteed that the misalignment amount remains the same throughout the
5743 execution of the vectorized loop. Therefore, we can create the
5744 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5745 at the loop preheader.
5747 However, in the case of outer-loop vectorization, when vectorizing a
5748 memory access in the inner-loop nested within the LOOP that is now being
5749 vectorized, while it is guaranteed that the misalignment of the
5750 vectorized memory access will remain the same in different outer-loop
5751 iterations, it is *not* guaranteed that is will remain the same throughout
5752 the execution of the inner-loop. This is because the inner-loop advances
5753 with the original scalar step (and not in steps of VS). If the inner-loop
5754 step happens to be a multiple of VS, then the misalignment remains fixed
5755 and we can use the optimized realignment scheme. For example:
5757 for (i=0; i<N; i++)
5758 for (j=0; j<M; j++)
5759 s += a[i+j];
5761 When vectorizing the i-loop in the above example, the step between
5762 consecutive vector loads is 1, and so the misalignment does not remain
5763 fixed across the execution of the inner-loop, and the realignment cannot
5764 be optimized (as illustrated in the following pseudo vectorized loop):
5766 for (i=0; i<N; i+=4)
5767 for (j=0; j<M; j++){
5768 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5769 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5770 // (assuming that we start from an aligned address).
5773 We therefore have to use the unoptimized realignment scheme:
5775 for (i=0; i<N; i+=4)
5776 for (j=k; j<M; j+=4)
5777 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5778 // that the misalignment of the initial address is
5779 // 0).
5781 The loop can then be vectorized as follows:
5783 for (k=0; k<4; k++){
5784 rt = get_realignment_token (&vp[k]);
5785 for (i=0; i<N; i+=4){
5786 v1 = vp[i+k];
5787 for (j=k; j<M; j+=4){
5788 v2 = vp[i+j+VS-1];
5789 va = REALIGN_LOAD <v1,v2,rt>;
5790 vs += va;
5791 v1 = v2;
5794 } */
5796 if (DR_IS_READ (dr))
5798 bool is_packed = false;
5799 tree type = (TREE_TYPE (DR_REF (dr)));
5801 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5802 && (!targetm.vectorize.builtin_mask_for_load
5803 || targetm.vectorize.builtin_mask_for_load ()))
5805 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5806 if ((nested_in_vect_loop
5807 && (TREE_INT_CST_LOW (DR_STEP (dr))
5808 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5809 || !loop_vinfo)
5810 return dr_explicit_realign;
5811 else
5812 return dr_explicit_realign_optimized;
5814 if (!known_alignment_for_access_p (dr))
5815 is_packed = not_size_aligned (DR_REF (dr));
5817 if ((TYPE_USER_ALIGN (type) && !is_packed)
5818 || targetm.vectorize.
5819 support_vector_misalignment (mode, type,
5820 DR_MISALIGNMENT (dr), is_packed))
5821 /* Can't software pipeline the loads, but can at least do them. */
5822 return dr_unaligned_supported;
5824 else
5826 bool is_packed = false;
5827 tree type = (TREE_TYPE (DR_REF (dr)));
5829 if (!known_alignment_for_access_p (dr))
5830 is_packed = not_size_aligned (DR_REF (dr));
5832 if ((TYPE_USER_ALIGN (type) && !is_packed)
5833 || targetm.vectorize.
5834 support_vector_misalignment (mode, type,
5835 DR_MISALIGNMENT (dr), is_packed))
5836 return dr_unaligned_supported;
5839 /* Unsupported. */
5840 return dr_unaligned_unsupported;