gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2014 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "stor-layout.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-ssa-alias.h"
34 #include "internal-fn.h"
35 #include "tree-eh.h"
36 #include "gimple-expr.h"
37 #include "is-a.h"
38 #include "gimple.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "gimple-ssa.h"
43 #include "tree-phinodes.h"
44 #include "ssa-iterators.h"
45 #include "stringpool.h"
46 #include "tree-ssanames.h"
47 #include "tree-ssa-loop-ivopts.h"
48 #include "tree-ssa-loop-manip.h"
49 #include "tree-ssa-loop.h"
50 #include "dumpfile.h"
51 #include "cfgloop.h"
52 #include "tree-chrec.h"
53 #include "tree-scalar-evolution.h"
54 #include "tree-vectorizer.h"
55 #include "diagnostic-core.h"
56 #include "cgraph.h"
57 /* Need to include rtl.h, expr.h, etc. for optabs. */
58 #include "expr.h"
59 #include "optabs.h"
61 /* Return true if load- or store-lanes optab OPTAB is implemented for
62 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
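/* For instance (a hypothetical target/type combination): with VECTYPE
   V4SI (16 bytes) and COUNT 3 the required array mode is a 48-byte
   integer mode.  If the target has no such mode (array_mode stays
   BLKmode) or no vec_load_lanes/vec_store_lanes pattern for it, the
   answer is false; on targets with structure load/store instructions
   such as AArch64 LD3/ST3 it can be true.  */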
64 static bool
65 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
66 tree vectype, unsigned HOST_WIDE_INT count)
68 enum machine_mode mode, array_mode;
69 bool limit_p;
71 mode = TYPE_MODE (vectype);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
74 MODE_INT, limit_p);
76 if (array_mode == BLKmode)
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
80 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
81 GET_MODE_NAME (mode), count);
82 return false;
85 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
89 "cannot use %s<%s><%s>\n", name,
90 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
91 return false;
94 if (dump_enabled_p ())
95 dump_printf_loc (MSG_NOTE, vect_location,
96 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
97 GET_MODE_NAME (mode));
99 return true;
103 /* Return the smallest scalar type used in STMT.
104 This is used to determine the vectype of the stmt. We generally set the
105 vectype according to the type of the result (lhs). For stmts whose
106 result-type is different than the type of the arguments (e.g., demotion,
107 promotion), vectype will be reset appropriately (later). Note that we have
108 to visit the smallest datatype in this function, because that determines the
109 VF. If the smallest datatype in the loop is present only as the rhs of a
110 promotion operation - we'd miss it.
111 Such a case, where a variable of this datatype does not appear in the lhs
112 anywhere in the loop, can only occur if it's an invariant: e.g.:
113 'int_x = (int) short_inv', which we'd expect to have been optimized away by
114 invariant motion. However, we cannot rely on invariant motion to always
115 take invariants out of the loop, and so in the case of promotion we also
116 have to check the rhs.
117 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
118 types. */
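/* A hypothetical example of the promotion case described above:

     short s;  int x[N], a[N];
     for (i = 0; i < N; i++)
       a[i] = x[i] + (int) s;

   'short' occurs only on the rhs of the widening conversion, so looking
   at lhs types alone would miss the smallest scalar type in the loop.  */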
120 tree
121 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
122 HOST_WIDE_INT *rhs_size_unit)
124 tree scalar_type = gimple_expr_type (stmt);
125 HOST_WIDE_INT lhs, rhs;
127 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129 if (is_gimple_assign (stmt)
130 && (gimple_assign_cast_p (stmt)
131 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
132 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
133 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
135 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
137 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
138 if (rhs < lhs)
139 scalar_type = rhs_type;
142 *lhs_size_unit = lhs;
143 *rhs_size_unit = rhs;
144 return scalar_type;
148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
149 tested at run-time. Return TRUE if DDR was successfully inserted.
150 Return false if versioning is not supported. */
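/* Conceptually (hypothetical refs A and B), the versioning condition
   emitted later checks that the two access ranges of the vectorized
   loop do not overlap:

     if (addr_A + segment_length_A <= addr_B
         || addr_B + segment_length_B <= addr_A)
       ... vectorized loop ...
     else
       ... scalar loop ...

   Both DR_STEPs must be compile-time constants (checked below) so that
   such segment lengths can be formed.  */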
152 static bool
153 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
155 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
157 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
158 return false;
160 if (dump_enabled_p ())
162 dump_printf_loc (MSG_NOTE, vect_location,
163 "mark for run-time aliasing test between ");
164 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
165 dump_printf (MSG_NOTE, " and ");
166 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
167 dump_printf (MSG_NOTE, "\n");
170 if (optimize_loop_nest_for_size_p (loop))
172 if (dump_enabled_p ())
173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
174 "versioning not supported when optimizing"
175 " for size.\n");
176 return false;
179 /* FORNOW: We don't support versioning with outer-loop vectorization. */
180 if (loop->inner)
182 if (dump_enabled_p ())
183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
184 "versioning not yet supported for outer-loops.\n");
185 return false;
188 /* FORNOW: We don't support creating runtime alias tests for non-constant
189 step. */
190 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
191 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
193 if (dump_enabled_p ())
194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
195 "versioning not yet supported for non-constant "
196 "step\n");
197 return false;
200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
201 return true;
205 /* Function vect_analyze_data_ref_dependence.
207 Return TRUE if there (might) exist a dependence between a memory-reference
208 DRA and a memory-reference DRB. If the dependence can instead be checked
209 at run-time by versioning for alias, return FALSE. Adjust *MAX_VF according
210 to the data dependence. */
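/* For instance, in the hypothetical loop

     for (i = 0; i < N; i++)
       a[i + 3] = a[i] + 1;

   the dependence distance is 3: at most 3 iterations may run in
   parallel, so *MAX_VF is reduced to 3, while a distance at least as
   large as the vectorization factor does not constrain it at all.  */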
212 static bool
213 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
214 loop_vec_info loop_vinfo, int *max_vf)
216 unsigned int i;
217 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
218 struct data_reference *dra = DDR_A (ddr);
219 struct data_reference *drb = DDR_B (ddr);
220 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
221 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
222 lambda_vector dist_v;
223 unsigned int loop_depth;
225 /* In loop analysis all data references should be vectorizable. */
226 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
227 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
228 gcc_unreachable ();
230 /* Independent data accesses. */
231 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
232 return false;
234 if (dra == drb
235 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
236 return false;
238 /* Even if we have an anti-dependence then, as the vectorized loop covers at
239 least two scalar iterations, there is always also a true dependence.
240 As the vectorizer does not re-order loads and stores we can ignore
241 the anti-dependence if TBAA can disambiguate both DRs similar to the
242 case with known negative distance anti-dependences (positive
243 distance anti-dependences would violate TBAA constraints). */
244 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
245 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
246 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
247 get_alias_set (DR_REF (drb))))
248 return false;
250 /* Unknown data dependence. */
251 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
253 /* If user asserted safelen consecutive iterations can be
254 executed concurrently, assume independence. */
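/* E.g. with a (hypothetical) '#pragma omp simd safelen(8)' annotation
   the user guarantees that any 8 consecutive iterations are
   independent, so the unknown dependence is ignored and *max_vf is
   clamped to at most 8.  */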
255 if (loop->safelen >= 2)
257 if (loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 return false;
263 if (STMT_VINFO_GATHER_P (stmtinfo_a)
264 || STMT_VINFO_GATHER_P (stmtinfo_b))
266 if (dump_enabled_p ())
268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
269 "versioning for alias not supported for: "
270 "can't determine dependence between ");
271 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
272 DR_REF (dra));
273 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
274 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
275 DR_REF (drb));
276 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
278 return true;
281 if (dump_enabled_p ())
283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
284 "versioning for alias required: "
285 "can't determine dependence between ");
286 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
287 DR_REF (dra));
288 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
289 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
290 DR_REF (drb));
291 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
294 /* Add to list of ddrs that need to be tested at run-time. */
295 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
298 /* Known data dependence. */
299 if (DDR_NUM_DIST_VECTS (ddr) == 0)
301 /* If user asserted safelen consecutive iterations can be
302 executed concurrently, assume independence. */
303 if (loop->safelen >= 2)
305 if (loop->safelen < *max_vf)
306 *max_vf = loop->safelen;
307 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
308 return false;
311 if (STMT_VINFO_GATHER_P (stmtinfo_a)
312 || STMT_VINFO_GATHER_P (stmtinfo_b))
314 if (dump_enabled_p ())
316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
317 "versioning for alias not supported for: "
318 "bad dist vector for ");
319 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
320 DR_REF (dra));
321 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
323 DR_REF (drb));
324 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
326 return true;
329 if (dump_enabled_p ())
331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
332 "versioning for alias required: "
333 "bad dist vector for ");
334 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
335 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
336 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
337 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
339 /* Add to list of ddrs that need to be tested at run-time. */
340 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
343 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
344 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
346 int dist = dist_v[loop_depth];
348 if (dump_enabled_p ())
349 dump_printf_loc (MSG_NOTE, vect_location,
350 "dependence distance = %d.\n", dist);
352 if (dist == 0)
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_NOTE, vect_location,
357 "dependence distance == 0 between ");
358 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
359 dump_printf (MSG_NOTE, " and ");
360 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
361 dump_printf (MSG_NOTE, "\n");
364 /* When we perform grouped accesses and perform implicit CSE
365 by detecting equal accesses and doing disambiguation with
366 runtime alias tests like for
367 .. = a[i];
368 .. = a[i+1];
369 a[i] = ..;
370 a[i+1] = ..;
371 *p = ..;
372 .. = a[i];
373 .. = a[i+1];
374 where we will end up loading { a[i], a[i+1] } once, make
375 sure that inserting group loads before the first load and
376 stores after the last store will do the right thing. */
377 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
378 && GROUP_SAME_DR_STMT (stmtinfo_a))
379 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
380 && GROUP_SAME_DR_STMT (stmtinfo_b)))
382 gimple earlier_stmt;
383 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
384 if (DR_IS_WRITE
385 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "READ_WRITE dependence in interleaving."
390 "\n");
391 return true;
395 continue;
398 if (dist > 0 && DDR_REVERSED_P (ddr))
400 /* If DDR_REVERSED_P the order of the data-refs in DDR was
401 reversed (to make distance vector positive), and the actual
402 distance is negative. */
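/* E.g. in the hypothetical loop body 'x = a[i+1]; a[i] = y;' the read
   of a[i+1] happens one iteration before the write of that element, so
   the effective distance is negative; as the vectorizer never moves the
   load after the store this cannot be violated, and it is skipped.  */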
403 if (dump_enabled_p ())
404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
405 "dependence distance negative.\n");
406 continue;
409 if (abs (dist) >= 2
410 && abs (dist) < *max_vf)
412 /* The dependence distance requires reduction of the maximal
413 vectorization factor. */
414 *max_vf = abs (dist);
415 if (dump_enabled_p ())
416 dump_printf_loc (MSG_NOTE, vect_location,
417 "adjusting maximal vectorization factor to %i\n",
418 *max_vf);
421 if (abs (dist) >= *max_vf)
423 /* Dependence distance does not create dependence, as far as
424 vectorization is concerned, in this case. */
425 if (dump_enabled_p ())
426 dump_printf_loc (MSG_NOTE, vect_location,
427 "dependence distance >= VF.\n");
428 continue;
431 if (dump_enabled_p ())
433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
434 "not vectorized, possible dependence "
435 "between data-refs ");
436 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
437 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
439 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
442 return true;
445 return false;
448 /* Function vect_analyze_data_ref_dependences.
450 Examine all the data references in the loop, and make sure there do not
451 exist any data dependences between them. Set *MAX_VF according to
452 the maximum vectorization factor the data dependences allow. */
454 bool
455 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
457 unsigned int i;
458 struct data_dependence_relation *ddr;
460 if (dump_enabled_p ())
461 dump_printf_loc (MSG_NOTE, vect_location,
462 "=== vect_analyze_data_ref_dependences ===\n");
464 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
465 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
466 &LOOP_VINFO_DDRS (loop_vinfo),
467 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
468 return false;
470 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
471 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
472 return false;
474 return true;
478 /* Function vect_slp_analyze_data_ref_dependence.
480 Return TRUE if there (might) exist a dependence between a memory-reference
481 DRA and a memory-reference DRB that would prevent vectorizing the
482 basic-block. Unlike the loop analysis, no run-time alias checks are
483 created here. */
485 static bool
486 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
488 struct data_reference *dra = DDR_A (ddr);
489 struct data_reference *drb = DDR_B (ddr);
491 /* We need to check dependences of statements marked as unvectorizable
492 as well, they still can prohibit vectorization. */
494 /* Independent data accesses. */
495 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
496 return false;
498 if (dra == drb)
499 return false;
501 /* Read-read is OK. */
502 if (DR_IS_READ (dra) && DR_IS_READ (drb))
503 return false;
505 /* If dra and drb are part of the same interleaving chain consider
506 them independent. */
507 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
508 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
509 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
510 return false;
512 /* Unknown data dependence. */
513 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
515 if (dump_enabled_p ())
517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
518 "can't determine dependence between ");
519 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
520 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
521 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
522 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
525 else if (dump_enabled_p ())
527 dump_printf_loc (MSG_NOTE, vect_location,
528 "determined dependence between ");
529 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
530 dump_printf (MSG_NOTE, " and ");
531 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
532 dump_printf (MSG_NOTE, "\n");
535 /* We do not vectorize basic blocks with write-write dependencies. */
536 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
537 return true;
539 /* If we have a read-write dependence check that the load is before the store.
540 When we vectorize basic blocks, vector load can be only before
541 corresponding scalar load, and vector store can be only after its
542 corresponding scalar store. So the order of the accesses is preserved in
543 case the load is before the store. */
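/* E.g. in hypothetical straight-line code 'x = a[0]; ...; a[0] = y;'
   the vector load is emitted no later than the scalar load and the
   vector store no earlier than the scalar store, so the order is kept;
   a store followed by a load of the same location is rejected below.  */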
544 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
545 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
547 /* That only holds for load-store pairs taking part in vectorization. */
548 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
549 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
550 return false;
553 return true;
557 /* Function vect_slp_analyze_data_ref_dependences.
559 Examine all the data references in the basic-block, and make sure there
560 do not exist any data dependences between them. Unlike the loop variant,
561 no maximum vectorization factor is computed here. */
563 bool
564 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
566 struct data_dependence_relation *ddr;
567 unsigned int i;
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location,
571 "=== vect_slp_analyze_data_ref_dependences ===\n");
573 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
574 &BB_VINFO_DDRS (bb_vinfo),
575 vNULL, true))
576 return false;
578 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
579 if (vect_slp_analyze_data_ref_dependence (ddr))
580 return false;
582 return true;
586 /* Function vect_compute_data_ref_alignment
588 Compute the misalignment of the data reference DR.
590 Output:
591 1. If during the misalignment computation it is found that the data reference
592 cannot be vectorized then false is returned.
593 2. DR_MISALIGNMENT (DR) is defined.
595 FOR NOW: No analysis is actually performed. Misalignment is calculated
596 only for trivial cases. TODO. */
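/* E.g. (hypothetical access) for vectype V4SI and a reference a[i+3]
   where 'int a[]' is itself 16-byte aligned, DR_BASE_ADDRESS is &a,
   DR_INIT is 12, and the misalignment computed below is
   12 mod 16 = 12 bytes.  */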
598 static bool
599 vect_compute_data_ref_alignment (struct data_reference *dr)
601 gimple stmt = DR_STMT (dr);
602 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
603 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
604 struct loop *loop = NULL;
605 tree ref = DR_REF (dr);
606 tree vectype;
607 tree base, base_addr;
608 bool base_aligned;
609 tree misalign;
610 tree aligned_to, alignment;
612 if (dump_enabled_p ())
613 dump_printf_loc (MSG_NOTE, vect_location,
614 "vect_compute_data_ref_alignment:\n");
616 if (loop_vinfo)
617 loop = LOOP_VINFO_LOOP (loop_vinfo);
619 /* Initialize misalignment to unknown. */
620 SET_DR_MISALIGNMENT (dr, -1);
622 /* Strided loads perform only component accesses, misalignment information
623 is irrelevant for them. */
624 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
625 return true;
627 misalign = DR_INIT (dr);
628 aligned_to = DR_ALIGNED_TO (dr);
629 base_addr = DR_BASE_ADDRESS (dr);
630 vectype = STMT_VINFO_VECTYPE (stmt_info);
632 /* In case the dataref is in an inner-loop of the loop that is being
633 vectorized (LOOP), we use the base and misalignment information
634 relative to the outer-loop (LOOP). This is ok only if the misalignment
635 stays the same throughout the execution of the inner-loop, which is why
636 we have to check that the stride of the dataref in the inner-loop is a
637 multiple of the vector size. */
638 if (loop && nested_in_vect_loop_p (loop, stmt))
640 tree step = DR_STEP (dr);
641 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
643 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
645 if (dump_enabled_p ())
646 dump_printf_loc (MSG_NOTE, vect_location,
647 "inner step divides the vector-size.\n");
648 misalign = STMT_VINFO_DR_INIT (stmt_info);
649 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
650 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
652 else
654 if (dump_enabled_p ())
655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
656 "inner step doesn't divide the vector-size.\n");
657 misalign = NULL_TREE;
661 /* Similarly, if we're doing basic-block vectorization, we can only use
662 base and misalignment information relative to an innermost loop if the
663 misalignment stays the same throughout the execution of the loop.
664 As above, this is the case if the stride of the dataref is a multiple
665 of the vector size. */
666 if (!loop)
668 tree step = DR_STEP (dr);
669 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
671 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
673 if (dump_enabled_p ())
674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
675 "SLP: step doesn't divide the vector-size.\n");
676 misalign = NULL_TREE;
680 base = build_fold_indirect_ref (base_addr);
681 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
683 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
684 || !misalign)
686 if (dump_enabled_p ())
688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
689 "Unknown alignment for access: ");
690 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
691 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
693 return true;
696 if ((DECL_P (base)
697 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
698 alignment) >= 0)
699 || (TREE_CODE (base_addr) == SSA_NAME
700 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
701 TREE_TYPE (base_addr)))),
702 alignment) >= 0)
703 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
704 base_aligned = true;
705 else
706 base_aligned = false;
708 if (!base_aligned)
710 /* Do not change the alignment of global variables here if
711 flag_section_anchors is enabled as we already generated
712 RTL for other functions. Most global variables should
713 have been aligned during the IPA increase_alignment pass. */
714 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
715 || (TREE_STATIC (base) && flag_section_anchors))
717 if (dump_enabled_p ())
719 dump_printf_loc (MSG_NOTE, vect_location,
720 "can't force alignment of ref: ");
721 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
722 dump_printf (MSG_NOTE, "\n");
724 return true;
727 /* Force the alignment of the decl.
728 NOTE: This is the only change to the code we make during
729 the analysis phase, before deciding to vectorize the loop. */
730 if (dump_enabled_p ())
732 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
733 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
734 dump_printf (MSG_NOTE, "\n");
737 ((dataref_aux *)dr->aux)->base_decl = base;
738 ((dataref_aux *)dr->aux)->base_misaligned = true;
741 /* If this is a backward running DR then first access in the larger
742 vectype actually is N-1 elements before the address in the DR.
743 Adjust misalign accordingly. */
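/* E.g. (hypothetical) for a V4SI access with DR_STEP -4 and DR_INIT 0:
   offset = 3 * -4 = -12, so misalign becomes -12, i.e. 4 after the
   modulo by the 16-byte vector alignment below, which matches the
   lowest address actually touched by the first vector access.  */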
744 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
746 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
747 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
748 otherwise we wouldn't be here. */
749 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
750 /* PLUS because DR_STEP was negative. */
751 misalign = size_binop (PLUS_EXPR, misalign, offset);
754 /* Modulo alignment. */
755 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
757 if (!tree_fits_uhwi_p (misalign))
759 /* Negative or overflowed misalignment value. */
760 if (dump_enabled_p ())
761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
762 "unexpected misalign value\n");
763 return false;
766 SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
768 if (dump_enabled_p ())
770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
771 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
772 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
773 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
776 return true;
780 /* Function vect_compute_data_refs_alignment
782 Compute the misalignment of data references in the loop.
783 Return FALSE if a data reference is found that cannot be vectorized. */
785 static bool
786 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
787 bb_vec_info bb_vinfo)
789 vec<data_reference_p> datarefs;
790 struct data_reference *dr;
791 unsigned int i;
793 if (loop_vinfo)
794 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
795 else
796 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
798 FOR_EACH_VEC_ELT (datarefs, i, dr)
799 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
800 && !vect_compute_data_ref_alignment (dr))
802 if (bb_vinfo)
804 /* Mark unsupported statement as unvectorizable. */
805 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
806 continue;
808 else
809 return false;
812 return true;
816 /* Function vect_update_misalignment_for_peel
818 DR - the data reference whose misalignment is to be adjusted.
819 DR_PEEL - the data reference whose misalignment is being made
820 zero in the vector loop by the peel.
821 NPEEL - the number of iterations in the peel loop if the misalignment
822 of DR_PEEL is known at compile time. */
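/* E.g. (hypothetical numbers) when peeling NPEEL = 2 iterations for a
   16-byte vectype, a forward non-grouped DR with element size 4 and
   known misalignment 4 becomes (4 + 2*4) & 15 = 12; DRs on DR_PEEL's
   same-alignment list are set to 0, and any other DR with unknown
   misalignment stays unknown (-1).  */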
824 static void
825 vect_update_misalignment_for_peel (struct data_reference *dr,
826 struct data_reference *dr_peel, int npeel)
828 unsigned int i;
829 vec<dr_p> same_align_drs;
830 struct data_reference *current_dr;
831 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
832 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
833 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
834 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
836 /* For interleaved data accesses the step in the loop must be multiplied by
837 the size of the interleaving group. */
838 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
839 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
840 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
841 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
843 /* It can be assumed that the data refs with the same alignment as dr_peel
844 are aligned in the vector loop. */
845 same_align_drs
846 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
847 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
849 if (current_dr != dr)
850 continue;
851 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
852 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
853 SET_DR_MISALIGNMENT (dr, 0);
854 return;
857 if (known_alignment_for_access_p (dr)
858 && known_alignment_for_access_p (dr_peel))
860 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
861 int misal = DR_MISALIGNMENT (dr);
862 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
863 misal += negative ? -npeel * dr_size : npeel * dr_size;
864 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
865 SET_DR_MISALIGNMENT (dr, misal);
866 return;
869 if (dump_enabled_p ())
870 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
871 SET_DR_MISALIGNMENT (dr, -1);
875 /* Function vect_verify_datarefs_alignment
877 Return TRUE if all data references in the loop can be
878 handled with respect to alignment. */
880 bool
881 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
883 vec<data_reference_p> datarefs;
884 struct data_reference *dr;
885 enum dr_alignment_support supportable_dr_alignment;
886 unsigned int i;
888 if (loop_vinfo)
889 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
890 else
891 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
893 FOR_EACH_VEC_ELT (datarefs, i, dr)
895 gimple stmt = DR_STMT (dr);
896 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
898 if (!STMT_VINFO_RELEVANT_P (stmt_info))
899 continue;
901 /* For interleaving, only the alignment of the first access matters.
902 Skip statements marked as not vectorizable. */
903 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
904 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
905 || !STMT_VINFO_VECTORIZABLE (stmt_info))
906 continue;
908 /* Strided loads perform only component accesses, alignment is
909 irrelevant for them. */
910 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
911 continue;
913 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
914 if (!supportable_dr_alignment)
916 if (dump_enabled_p ())
918 if (DR_IS_READ (dr))
919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
920 "not vectorized: unsupported unaligned load.");
921 else
922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
923 "not vectorized: unsupported unaligned "
924 "store.");
926 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
927 DR_REF (dr));
928 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
930 return false;
932 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
933 dump_printf_loc (MSG_NOTE, vect_location,
934 "Vectorizing an unaligned access.\n");
936 return true;
939 /* Given a memory reference EXP return whether its alignment is less
940 than its size. */
942 static bool
943 not_size_aligned (tree exp)
945 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
946 return true;
948 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
949 > get_object_alignment (exp));
952 /* Function vector_alignment_reachable_p
954 Return true if vector alignment for DR is reachable by peeling
955 a few loop iterations. Return false otherwise. */
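/* E.g. (hypothetical accesses) a char access misaligned by 3 bytes
   against a 16-byte V16QI vectype becomes aligned after peeling 13
   scalar iterations, whereas a 4-byte int access misaligned by 2 bytes
   can never be aligned by peeling whole elements, so false is returned
   for it.  */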
957 static bool
958 vector_alignment_reachable_p (struct data_reference *dr)
960 gimple stmt = DR_STMT (dr);
961 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
962 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
964 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
966 /* For interleaved access we peel only if the number of iterations in
967 the prolog loop ({VF - misalignment}) is a multiple of the
968 number of the interleaved accesses. */
969 int elem_size, mis_in_elements;
970 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
972 /* FORNOW: handle only known alignment. */
973 if (!known_alignment_for_access_p (dr))
974 return false;
976 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
977 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
979 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
980 return false;
983 /* If misalignment is known at the compile time then allow peeling
984 only if natural alignment is reachable through peeling. */
985 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
987 HOST_WIDE_INT elmsize =
988 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
989 if (dump_enabled_p ())
991 dump_printf_loc (MSG_NOTE, vect_location,
992 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
993 dump_printf (MSG_NOTE,
994 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
996 if (DR_MISALIGNMENT (dr) % elmsize)
998 if (dump_enabled_p ())
999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1000 "data size does not divide the misalignment.\n");
1001 return false;
1005 if (!known_alignment_for_access_p (dr))
1007 tree type = TREE_TYPE (DR_REF (dr));
1008 bool is_packed = not_size_aligned (DR_REF (dr));
1009 if (dump_enabled_p ())
1010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1011 "Unknown misalignment, is_packed = %d\n",is_packed);
1012 if ((TYPE_USER_ALIGN (type) && !is_packed)
1013 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1014 return true;
1015 else
1016 return false;
1019 return true;
1023 /* Calculate the cost of the memory access represented by DR. */
1025 static void
1026 vect_get_data_access_cost (struct data_reference *dr,
1027 unsigned int *inside_cost,
1028 unsigned int *outside_cost,
1029 stmt_vector_for_cost *body_cost_vec)
1031 gimple stmt = DR_STMT (dr);
1032 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1033 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1034 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1035 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1036 int ncopies = vf / nunits;
1038 if (DR_IS_READ (dr))
1039 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1040 NULL, body_cost_vec, false);
1041 else
1042 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_NOTE, vect_location,
1046 "vect_get_data_access_cost: inside_cost = %d, "
1047 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1051 /* Insert DR into peeling hash table with NPEEL as key. */
1053 static void
1054 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1055 int npeel)
1057 struct _vect_peel_info elem, *slot;
1058 _vect_peel_info **new_slot;
1059 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1061 elem.npeel = npeel;
1062 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1063 if (slot)
1064 slot->count++;
1065 else
1067 slot = XNEW (struct _vect_peel_info);
1068 slot->npeel = npeel;
1069 slot->dr = dr;
1070 slot->count = 1;
1071 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1072 *new_slot = slot;
1075 if (!supportable_dr_alignment
1076 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1077 slot->count += VECT_MAX_COST;
1081 /* Traverse peeling hash table to find peeling option that aligns maximum
1082 number of data accesses. */
1084 static int
1085 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1086 _vect_peel_extended_info *max)
1088 vect_peel_info elem = *slot;
1090 if (elem->count > max->peel_info.count
1091 || (elem->count == max->peel_info.count
1092 && max->peel_info.npeel > elem->npeel))
1094 max->peel_info.npeel = elem->npeel;
1095 max->peel_info.count = elem->count;
1096 max->peel_info.dr = elem->dr;
1099 return 1;
1103 /* Traverse peeling hash table and calculate cost for each peeling option.
1104 Find the one with the lowest cost. */
1106 static int
1107 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1108 _vect_peel_extended_info *min)
1110 vect_peel_info elem = *slot;
1111 int save_misalignment, dummy;
1112 unsigned int inside_cost = 0, outside_cost = 0, i;
1113 gimple stmt = DR_STMT (elem->dr);
1114 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1115 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1116 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1117 struct data_reference *dr;
1118 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1119 int single_iter_cost;
1121 prologue_cost_vec.create (2);
1122 body_cost_vec.create (2);
1123 epilogue_cost_vec.create (2);
1125 FOR_EACH_VEC_ELT (datarefs, i, dr)
1127 stmt = DR_STMT (dr);
1128 stmt_info = vinfo_for_stmt (stmt);
1129 /* For interleaving, only the alignment of the first access
1130 matters. */
1131 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1132 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1133 continue;
1135 save_misalignment = DR_MISALIGNMENT (dr);
1136 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1137 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1138 &body_cost_vec);
1139 SET_DR_MISALIGNMENT (dr, save_misalignment);
1142 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1143 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1144 &dummy, single_iter_cost,
1145 &prologue_cost_vec,
1146 &epilogue_cost_vec);
1148 /* Prologue and epilogue costs are added to the target model later.
1149 These costs depend only on the scalar iteration cost, the
1150 number of peeling iterations finally chosen, and the number of
1151 misaligned statements. So discard the information found here. */
1152 prologue_cost_vec.release ();
1153 epilogue_cost_vec.release ();
1155 if (inside_cost < min->inside_cost
1156 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1158 min->inside_cost = inside_cost;
1159 min->outside_cost = outside_cost;
1160 min->body_cost_vec.release ();
1161 min->body_cost_vec = body_cost_vec;
1162 min->peel_info.dr = elem->dr;
1163 min->peel_info.npeel = elem->npeel;
1165 else
1166 body_cost_vec.release ();
1168 return 1;
1172 /* Choose best peeling option by traversing peeling hash table and either
1173 choosing an option with the lowest cost (if cost model is enabled) or the
1174 option that aligns as many accesses as possible. */
1176 static struct data_reference *
1177 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1178 unsigned int *npeel,
1179 stmt_vector_for_cost *body_cost_vec)
1181 struct _vect_peel_extended_info res;
1183 res.peel_info.dr = NULL;
1184 res.body_cost_vec = stmt_vector_for_cost ();
1186 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1188 res.inside_cost = INT_MAX;
1189 res.outside_cost = INT_MAX;
1190 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1191 .traverse <_vect_peel_extended_info *,
1192 vect_peeling_hash_get_lowest_cost> (&res);
1194 else
1196 res.peel_info.count = 0;
1197 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1198 .traverse <_vect_peel_extended_info *,
1199 vect_peeling_hash_get_most_frequent> (&res);
1202 *npeel = res.peel_info.npeel;
1203 *body_cost_vec = res.body_cost_vec;
1204 return res.peel_info.dr;
1208 /* Function vect_enhance_data_refs_alignment
1210 This pass will use loop versioning and loop peeling in order to enhance
1211 the alignment of data references in the loop.
1213 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1214 original loop is to be vectorized. Any other loops that are created by
1215 the transformations performed in this pass - are not supposed to be
1216 vectorized. This restriction will be relaxed.
1218 This pass will require a cost model to guide it whether to apply peeling
1219 or versioning or a combination of the two. For example, the scheme that
1220 Intel uses when given a loop with several memory accesses is as follows:
1221 choose one memory access ('p') whose alignment you want to force by doing
1222 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1223 other accesses are not necessarily aligned, or (2) use loop versioning to
1224 generate one loop in which all accesses are aligned, and another loop in
1225 which only 'p' is necessarily aligned.
1227 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1228 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1229 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1231 Devising a cost model is the most critical aspect of this work. It will
1232 guide us on which access to peel for, whether to use loop versioning, how
1233 many versions to create, etc. The cost model will probably consist of
1234 generic considerations as well as target specific considerations (on
1235 powerpc for example, misaligned stores are more painful than misaligned
1236 loads).
1238 Here are the general steps involved in alignment enhancements:
1240 -- original loop, before alignment analysis:
1241 for (i=0; i<N; i++){
1242 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1243 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1246 -- After vect_compute_data_refs_alignment:
1247 for (i=0; i<N; i++){
1248 x = q[i]; # DR_MISALIGNMENT(q) = 3
1249 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1252 -- Possibility 1: we do loop versioning:
1253 if (p is aligned) {
1254 for (i=0; i<N; i++){ # loop 1A
1255 x = q[i]; # DR_MISALIGNMENT(q) = 3
1256 p[i] = y; # DR_MISALIGNMENT(p) = 0
1259 else {
1260 for (i=0; i<N; i++){ # loop 1B
1261 x = q[i]; # DR_MISALIGNMENT(q) = 3
1262 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1266 -- Possibility 2: we do loop peeling:
1267 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1268 x = q[i];
1269 p[i] = y;
1271 for (i = 3; i < N; i++){ # loop 2A
1272 x = q[i]; # DR_MISALIGNMENT(q) = 0
1273 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1276 -- Possibility 3: combination of loop peeling and versioning:
1277 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1278 x = q[i];
1279 p[i] = y;
1281 if (p is aligned) {
1282 for (i = 3; i<N; i++){ # loop 3A
1283 x = q[i]; # DR_MISALIGNMENT(q) = 0
1284 p[i] = y; # DR_MISALIGNMENT(p) = 0
1287 else {
1288 for (i = 3; i<N; i++){ # loop 3B
1289 x = q[i]; # DR_MISALIGNMENT(q) = 0
1290 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1294 These loops are later passed to loop_transform to be vectorized. The
1295 vectorizer will use the alignment information to guide the transformation
1296 (whether to generate regular loads/stores, or with special handling for
1297 misalignment). */
1299 bool
1300 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1302 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1303 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1304 enum dr_alignment_support supportable_dr_alignment;
1305 struct data_reference *dr0 = NULL, *first_store = NULL;
1306 struct data_reference *dr;
1307 unsigned int i, j;
1308 bool do_peeling = false;
1309 bool do_versioning = false;
1310 bool stat;
1311 gimple stmt;
1312 stmt_vec_info stmt_info;
1313 unsigned int npeel = 0;
1314 bool all_misalignments_unknown = true;
1315 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1316 unsigned possible_npeel_number = 1;
1317 tree vectype;
1318 unsigned int nelements, mis, same_align_drs_max = 0;
1319 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_NOTE, vect_location,
1323 "=== vect_enhance_data_refs_alignment ===\n");
1325 /* While cost model enhancements are expected in the future, the high level
1326 view of the code at this time is as follows:
1328 A) If there is a misaligned access then see if peeling to align
1329 this access can make all data references satisfy
1330 vect_supportable_dr_alignment. If so, update data structures
1331 as needed and return true.
1333 B) If peeling wasn't possible and there is a data reference with an
1334 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1335 then see if loop versioning checks can be used to make all data
1336 references satisfy vect_supportable_dr_alignment. If so, update
1337 data structures as needed and return true.
1339 C) If neither peeling nor versioning were successful then return false if
1340 any data reference does not satisfy vect_supportable_dr_alignment.
1342 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1344 Note, Possibility 3 above (which is peeling and versioning together) is not
1345 being done at this time. */
1347 /* (1) Peeling to force alignment. */
1349 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1350 Considerations:
1351 + How many accesses will become aligned due to the peeling
1352 - How many accesses will become unaligned due to the peeling,
1353 and the cost of misaligned accesses.
1354 - The cost of peeling (the extra runtime checks, the increase
1355 in code size). */
1357 FOR_EACH_VEC_ELT (datarefs, i, dr)
1359 stmt = DR_STMT (dr);
1360 stmt_info = vinfo_for_stmt (stmt);
1362 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1363 continue;
1365 /* For interleaving, only the alignment of the first access
1366 matters. */
1367 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1368 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1369 continue;
1371 /* For invariant accesses there is nothing to enhance. */
1372 if (integer_zerop (DR_STEP (dr)))
1373 continue;
1375 /* Strided loads perform only component accesses, alignment is
1376 irrelevant for them. */
1377 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1378 continue;
1380 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1381 do_peeling = vector_alignment_reachable_p (dr);
1382 if (do_peeling)
1384 if (known_alignment_for_access_p (dr))
1386 unsigned int npeel_tmp;
1387 bool negative = tree_int_cst_compare (DR_STEP (dr),
1388 size_zero_node) < 0;
1390 /* Save info about DR in the hash table. */
1391 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1392 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1394 vectype = STMT_VINFO_VECTYPE (stmt_info);
1395 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1396 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1397 TREE_TYPE (DR_REF (dr))));
1398 npeel_tmp = (negative
1399 ? (mis - nelements) : (nelements - mis))
1400 & (nelements - 1);
1402 /* For multiple types, it is possible that the bigger type access
1403 will have more than one peeling option. E.g., a loop with two
1404 types: one of size (vector size / 4), and the other one of
1405 size (vector size / 8). Vectorization factor will be 8. If both
1406 accesses are misaligned by 3, the first one needs one scalar
1407 iteration to be aligned, and the second one needs 5. But the
1408 first one will be aligned also by peeling 5 scalar
1409 iterations, and in that case both accesses will be aligned.
1410 Hence, except for the immediate peeling amount, we also want
1411 to try to add full vector size, while we don't exceed
1412 vectorization factor.
1413 We do this automatically for the cost model, since we calculate cost
1414 for every peeling option. */
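/* E.g. (hypothetical numbers) with nelements = 4 and a misalignment of
   3 elements, npeel_tmp = (4 - 3) & 3 = 1; with vf = 8 the loop below
   also records the option 1 + 4 = 5, mirroring the example in the
   comment above.  */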
1415 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1416 possible_npeel_number = vf / nelements;
1418 /* Handle the aligned case. We may decide to align some other
1419 access, making DR unaligned. */
1420 if (DR_MISALIGNMENT (dr) == 0)
1422 npeel_tmp = 0;
1423 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1424 possible_npeel_number++;
1427 for (j = 0; j < possible_npeel_number; j++)
1429 gcc_assert (npeel_tmp <= vf);
1430 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1431 npeel_tmp += nelements;
1434 all_misalignments_unknown = false;
1435 /* Data-ref that was chosen for the case that all the
1436 misalignments are unknown is not relevant anymore, since we
1437 have a data-ref with known alignment. */
1438 dr0 = NULL;
1440 else
1442 /* If we don't know any misalignment values, we prefer
1443 peeling for the data-ref that has the maximum number of data-refs
1444 with the same alignment, unless the target prefers to align
1445 stores over loads. */
1446 if (all_misalignments_unknown)
1448 unsigned same_align_drs
1449 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1450 if (!dr0
1451 || same_align_drs_max < same_align_drs)
1453 same_align_drs_max = same_align_drs;
1454 dr0 = dr;
1456 /* For data-refs with the same number of related
1457 accesses prefer the one where the misalign
1458 computation will be invariant in the outermost loop. */
1459 else if (same_align_drs_max == same_align_drs)
1461 struct loop *ivloop0, *ivloop;
1462 ivloop0 = outermost_invariant_loop_for_expr
1463 (loop, DR_BASE_ADDRESS (dr0));
1464 ivloop = outermost_invariant_loop_for_expr
1465 (loop, DR_BASE_ADDRESS (dr));
1466 if ((ivloop && !ivloop0)
1467 || (ivloop && ivloop0
1468 && flow_loop_nested_p (ivloop, ivloop0)))
1469 dr0 = dr;
1472 if (!first_store && DR_IS_WRITE (dr))
1473 first_store = dr;
1476 /* If there are both known and unknown misaligned accesses in the
1477 loop, we choose peeling amount according to the known
1478 accesses. */
1479 if (!supportable_dr_alignment)
1481 dr0 = dr;
1482 if (!first_store && DR_IS_WRITE (dr))
1483 first_store = dr;
1487 else
1489 if (!aligned_access_p (dr))
1491 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1493 "vector alignment may not be reachable\n");
1494 break;
1499 /* Check if we can possibly peel the loop. */
1500 if (!vect_can_advance_ivs_p (loop_vinfo)
1501 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1502 do_peeling = false;
1504 if (do_peeling && all_misalignments_unknown
1505 && vect_supportable_dr_alignment (dr0, false))
1508 /* Check if the target requires to prefer stores over loads, i.e., if
1509 misaligned stores are more expensive than misaligned loads (taking
1510 drs with same alignment into account). */
1511 if (first_store && DR_IS_READ (dr0))
1513 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1514 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1515 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1516 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1517 stmt_vector_for_cost dummy;
1518 dummy.create (2);
1520 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1521 &dummy);
1522 vect_get_data_access_cost (first_store, &store_inside_cost,
1523 &store_outside_cost, &dummy);
1525 dummy.release ();
1527 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1528 aligning the load DR0). */
1529 load_inside_penalty = store_inside_cost;
1530 load_outside_penalty = store_outside_cost;
1531 for (i = 0;
1532 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1533 DR_STMT (first_store))).iterate (i, &dr);
1534 i++)
1535 if (DR_IS_READ (dr))
1537 load_inside_penalty += load_inside_cost;
1538 load_outside_penalty += load_outside_cost;
1540 else
1542 load_inside_penalty += store_inside_cost;
1543 load_outside_penalty += store_outside_cost;
1546 /* Calculate the penalty for leaving DR0 unaligned (by
1547 aligning the FIRST_STORE). */
1548 store_inside_penalty = load_inside_cost;
1549 store_outside_penalty = load_outside_cost;
1550 for (i = 0;
1551 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1552 DR_STMT (dr0))).iterate (i, &dr);
1553 i++)
1554 if (DR_IS_READ (dr))
1556 store_inside_penalty += load_inside_cost;
1557 store_outside_penalty += load_outside_cost;
1559 else
1561 store_inside_penalty += store_inside_cost;
1562 store_outside_penalty += store_outside_cost;
1565 if (load_inside_penalty > store_inside_penalty
1566 || (load_inside_penalty == store_inside_penalty
1567 && load_outside_penalty > store_outside_penalty))
1568 dr0 = first_store;
1571 /* In case there are only loads with different unknown misalignments, use
1572 peeling only if it may help to align other accesses in the loop. */
1573 if (!first_store
1574 && !STMT_VINFO_SAME_ALIGN_REFS (
1575 vinfo_for_stmt (DR_STMT (dr0))).length ()
1576 && vect_supportable_dr_alignment (dr0, false)
1577 != dr_unaligned_supported)
1578 do_peeling = false;
1581 if (do_peeling && !dr0)
1583 /* Peeling is possible, but no data access strictly requires alignment
1584 in order to be vectorized. So we try to choose the best possible peeling. */
1586 /* We should get here only if there are drs with known misalignment. */
1587 gcc_assert (!all_misalignments_unknown);
1589 /* Choose the best peeling from the hash table. */
1590 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1591 &body_cost_vec);
1592 if (!dr0 || !npeel)
1593 do_peeling = false;
1596 if (do_peeling)
1598 stmt = DR_STMT (dr0);
1599 stmt_info = vinfo_for_stmt (stmt);
1600 vectype = STMT_VINFO_VECTYPE (stmt_info);
1601 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1603 if (known_alignment_for_access_p (dr0))
1605 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1606 size_zero_node) < 0;
1607 if (!npeel)
1609 /* Since it's known at compile time, compute the number of
1610 iterations in the peeled loop (the peeling factor) for use in
1611 updating DR_MISALIGNMENT values. The peeling factor is the
1612 vectorization factor minus the misalignment as an element
1613 count. */
1614 mis = DR_MISALIGNMENT (dr0);
1615 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1616 npeel = ((negative ? mis - nelements : nelements - mis)
1617 & (nelements - 1));
1620 /* For interleaved data access every iteration accesses all the
1621 members of the group, therefore we divide the number of iterations
1622 by the group size. */
1623 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1624 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1625 npeel /= GROUP_SIZE (stmt_info);
1627 if (dump_enabled_p ())
1628 dump_printf_loc (MSG_NOTE, vect_location,
1629 "Try peeling by %d\n", npeel);
1632 /* Ensure that all data refs can be vectorized after the peel. */
1633 FOR_EACH_VEC_ELT (datarefs, i, dr)
1635 int save_misalignment;
1637 if (dr == dr0)
1638 continue;
1640 stmt = DR_STMT (dr);
1641 stmt_info = vinfo_for_stmt (stmt);
1642 /* For interleaving, only the alignment of the first access
1643 matters. */
1644 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1645 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1646 continue;
1648 /* Strided loads perform only component accesses, alignment is
1649 irrelevant for them. */
1650 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1651 continue;
1653 save_misalignment = DR_MISALIGNMENT (dr);
1654 vect_update_misalignment_for_peel (dr, dr0, npeel);
1655 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1656 SET_DR_MISALIGNMENT (dr, save_misalignment);
1658 if (!supportable_dr_alignment)
1660 do_peeling = false;
1661 break;
1665 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1667 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1668 if (!stat)
1669 do_peeling = false;
1670 else
1672 body_cost_vec.release ();
1673 return stat;
1677 if (do_peeling)
1679 unsigned max_allowed_peel
1680 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1681 if (max_allowed_peel != (unsigned)-1)
1683 unsigned max_peel = npeel;
1684 if (max_peel == 0)
1686 gimple dr_stmt = DR_STMT (dr0);
1687 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1688 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1689 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1691 if (max_peel > max_allowed_peel)
1693 do_peeling = false;
1694 if (dump_enabled_p ())
1695 dump_printf_loc (MSG_NOTE, vect_location,
1696 "Disable peeling, max peels reached: %d\n", max_peel);
1701 if (do_peeling)
1703 stmt_info_for_cost *si;
1704 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1706 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1707 If the misalignment of DR_i is identical to that of dr0 then set
1708 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1709 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1710 by the peeling factor times the element size of DR_i (MOD the
1711 vectorization factor times the size). Otherwise, the
1712 misalignment of DR_i must be set to unknown. */
1713 FOR_EACH_VEC_ELT (datarefs, i, dr)
1714 if (dr != dr0)
1715 vect_update_misalignment_for_peel (dr, dr0, npeel);
1717 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1718 if (npeel)
1719 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1720 else
1721 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1722 = DR_MISALIGNMENT (dr0);
1723 SET_DR_MISALIGNMENT (dr0, 0);
1724 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_NOTE, vect_location,
1727 "Alignment of access forced using peeling.\n");
1728 dump_printf_loc (MSG_NOTE, vect_location,
1729 "Peeling for alignment will be applied.\n");
1731 /* We've delayed passing the inside-loop peeling costs to the
1732 target cost model until we were sure peeling would happen.
1733 Do so now. */
1734 if (body_cost_vec.exists ())
1736 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1738 struct _stmt_vec_info *stmt_info
1739 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1740 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1741 si->misalign, vect_body);
1743 body_cost_vec.release ();
1746 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1747 gcc_assert (stat);
1748 return stat;
1752 body_cost_vec.release ();
1754 /* (2) Versioning to force alignment. */
1756 /* Try versioning if:
1757 1) optimize loop for speed
1758 2) there is at least one unsupported misaligned data ref with an unknown
1759 misalignment, and
1760 3) all misaligned data refs with a known misalignment are supported, and
1761 4) the number of runtime alignment checks is within reason. */
1763 do_versioning =
1764 optimize_loop_nest_for_speed_p (loop)
1765 && (!loop->inner); /* FORNOW */
1767 if (do_versioning)
1769 FOR_EACH_VEC_ELT (datarefs, i, dr)
1771 stmt = DR_STMT (dr);
1772 stmt_info = vinfo_for_stmt (stmt);
1774 /* For interleaving, only the alignment of the first access
1775 matters. */
1776 if (aligned_access_p (dr)
1777 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1778 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1779 continue;
1781 /* Strided loads perform only component accesses; alignment is
1782 irrelevant for them. */
1783 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1784 continue;
1786 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1788 if (!supportable_dr_alignment)
1790 gimple stmt;
1791 int mask;
1792 tree vectype;
1794 if (known_alignment_for_access_p (dr)
1795 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1796 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1798 do_versioning = false;
1799 break;
1802 stmt = DR_STMT (dr);
1803 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1804 gcc_assert (vectype);
1806 /* The rightmost bits of an aligned address must be zeros.
1807 Construct the mask needed for this test. For example,
1808 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1809 mask must be 15 = 0xf. */
1810 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1812 /* FORNOW: use the same mask to test all potentially unaligned
1813 references in the loop. The vectorizer currently supports
1814 a single vector size, see the reference to
1815 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1816 vectorization factor is computed. */
1817 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1818 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1819 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1820 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1821 DR_STMT (dr));
1825 /* Versioning requires at least one misaligned data reference. */
1826 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1827 do_versioning = false;
1828 else if (!do_versioning)
1829 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1832 if (do_versioning)
1834 vec<gimple> may_misalign_stmts
1835 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1836 gimple stmt;
1838 /* It can now be assumed that the data references in the statements
1839 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1840 of the loop being vectorized. */
1841 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1843 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1844 dr = STMT_VINFO_DATA_REF (stmt_info);
1845 SET_DR_MISALIGNMENT (dr, 0);
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_NOTE, vect_location,
1848 "Alignment of access forced using versioning.\n");
1851 if (dump_enabled_p ())
1852 dump_printf_loc (MSG_NOTE, vect_location,
1853 "Versioning for alignment will be applied.\n");
1855 /* Peeling and versioning can't be done together at this time. */
1856 gcc_assert (! (do_peeling && do_versioning));
1858 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1859 gcc_assert (stat);
1860 return stat;
1863 /* This point is reached if neither peeling nor versioning is being done. */
1864 gcc_assert (! (do_peeling || do_versioning));
1866 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1867 return stat;
1871 /* Function vect_find_same_alignment_drs.
1873 Update group and alignment relations according to the chosen
1874 vectorization factor. */
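/* For example (illustrative only): with a vectorization factor of 4 and
4-byte elements, the references a[i] and a[i+4] have a dependence
distance of 4, which is 0 modulo the vectorization factor, so each is
recorded in the other's STMT_VINFO_SAME_ALIGN_REFS set; once peeling
aligns one of them, the other is known to be aligned as well. */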
1876 static void
1877 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1878 loop_vec_info loop_vinfo)
1880 unsigned int i;
1881 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1882 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1883 struct data_reference *dra = DDR_A (ddr);
1884 struct data_reference *drb = DDR_B (ddr);
1885 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1886 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1887 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1888 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1889 lambda_vector dist_v;
1890 unsigned int loop_depth;
1892 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1893 return;
1895 if (dra == drb)
1896 return;
1898 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1899 return;
1901 /* Loop-based vectorization and known data dependence. */
1902 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1903 return;
1905 /* Data-dependence analysis reports a distance vector of zero
1906 for data-references that overlap only in the first iteration
1907 but have different sign step (see PR45764).
1908 So as a sanity check require equal DR_STEP. */
1909 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1910 return;
1912 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1913 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1915 int dist = dist_v[loop_depth];
1917 if (dump_enabled_p ())
1918 dump_printf_loc (MSG_NOTE, vect_location,
1919 "dependence distance = %d.\n", dist);
1921 /* Same loop iteration. */
1922 if (dist == 0
1923 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1925 /* Two references with distance zero have the same alignment. */
1926 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1927 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1928 if (dump_enabled_p ())
1930 dump_printf_loc (MSG_NOTE, vect_location,
1931 "accesses have the same alignment.\n");
1932 dump_printf (MSG_NOTE,
1933 "dependence distance modulo vf == 0 between ");
1934 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1935 dump_printf (MSG_NOTE, " and ");
1936 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1937 dump_printf (MSG_NOTE, "\n");
1944 /* Function vect_analyze_data_refs_alignment
1946 Analyze the alignment of the data-references in the loop.
1947 Return FALSE if a data reference is found that cannot be vectorized. */
1949 bool
1950 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1951 bb_vec_info bb_vinfo)
1953 if (dump_enabled_p ())
1954 dump_printf_loc (MSG_NOTE, vect_location,
1955 "=== vect_analyze_data_refs_alignment ===\n");
1957 /* Mark groups of data references with same alignment using
1958 data dependence information. */
1959 if (loop_vinfo)
1961 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1962 struct data_dependence_relation *ddr;
1963 unsigned int i;
1965 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1966 vect_find_same_alignment_drs (ddr, loop_vinfo);
1969 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1971 if (dump_enabled_p ())
1972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1973 "not vectorized: can't calculate alignment "
1974 "for data ref.\n");
1975 return false;
1978 return true;
1982 /* Analyze groups of accesses: check that DR belongs to a group of
1983 accesses of legal size, step, etc. Detect gaps, single element
1984 interleaving, and other special cases. Set grouped access info.
1985 Collect groups of strided stores for further use in SLP analysis. */
1987 static bool
1988 vect_analyze_group_access (struct data_reference *dr)
1990 tree step = DR_STEP (dr);
1991 tree scalar_type = TREE_TYPE (DR_REF (dr));
1992 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
1993 gimple stmt = DR_STMT (dr);
1994 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1995 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1996 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
1997 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
1998 HOST_WIDE_INT groupsize, last_accessed_element = 1;
1999 bool slp_impossible = false;
2000 struct loop *loop = NULL;
2002 if (loop_vinfo)
2003 loop = LOOP_VINFO_LOOP (loop_vinfo);
2005 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2006 size of the interleaving group (including gaps). */
2007 groupsize = absu_hwi (dr_step) / type_size;
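/* E.g., a hypothetical group of 4-byte loads walking an array of 8-element
structures has DR_STEP == 32, so GROUPSIZE == 32 / 4 == 8, whether or not
all eight members are actually accessed. */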
2009 /* A non-consecutive access is possible only if it is a part of interleaving. */
2010 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2012 /* Check if this DR is a part of interleaving, and is a single
2013 element of the group that is accessed in the loop. */
2015 /* Gaps are supported only for loads. STEP must be a multiple of the type
2016 size. The size of the group must be a power of 2. */
2017 if (DR_IS_READ (dr)
2018 && (dr_step % type_size) == 0
2019 && groupsize > 0
2020 && exact_log2 (groupsize) != -1)
2022 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2023 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2024 if (dump_enabled_p ())
2026 dump_printf_loc (MSG_NOTE, vect_location,
2027 "Detected single element interleaving ");
2028 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2029 dump_printf (MSG_NOTE, " step ");
2030 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2031 dump_printf (MSG_NOTE, "\n");
2034 if (loop_vinfo)
2036 if (dump_enabled_p ())
2037 dump_printf_loc (MSG_NOTE, vect_location,
2038 "Data access with gaps requires scalar "
2039 "epilogue loop\n");
2040 if (loop->inner)
2042 if (dump_enabled_p ())
2043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2044 "Peeling for outer loop is not"
2045 " supported\n");
2046 return false;
2049 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2052 return true;
2055 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2058 "not consecutive access ");
2059 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2060 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2063 if (bb_vinfo)
2065 /* Mark the statement as unvectorizable. */
2066 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2067 return true;
2070 return false;
2073 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2075 /* First stmt in the interleaving chain. Check the chain. */
2076 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2077 struct data_reference *data_ref = dr;
2078 unsigned int count = 1;
2079 tree prev_init = DR_INIT (data_ref);
2080 gimple prev = stmt;
2081 HOST_WIDE_INT diff, gaps = 0;
2082 unsigned HOST_WIDE_INT count_in_bytes;
2084 while (next)
2086 /* Skip same data-refs. In case two or more stmts share a
2087 data-ref (supported only for loads), we vectorize only the first
2088 stmt, and the rest get their vectorized loads from the first
2089 one. */
2090 if (!tree_int_cst_compare (DR_INIT (data_ref),
2091 DR_INIT (STMT_VINFO_DATA_REF (
2092 vinfo_for_stmt (next)))))
2094 if (DR_IS_WRITE (data_ref))
2096 if (dump_enabled_p ())
2097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098 "Two store stmts share the same dr.\n");
2099 return false;
2102 /* For load use the same data-ref load. */
2103 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2105 prev = next;
2106 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2107 continue;
2110 prev = next;
2111 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2113 /* All group members have the same STEP by construction. */
2114 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2116 /* Check that the distance between two accesses is equal to the type
2117 size. Otherwise, we have gaps. */
2118 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2119 - TREE_INT_CST_LOW (prev_init)) / type_size;
2120 if (diff != 1)
2122 /* FORNOW: SLP of accesses with gaps is not supported. */
2123 slp_impossible = true;
2124 if (DR_IS_WRITE (data_ref))
2126 if (dump_enabled_p ())
2127 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2128 "interleaved store with gaps\n");
2129 return false;
2132 gaps += diff - 1;
2135 last_accessed_element += diff;
2137 /* Store the gap from the previous member of the group. If there is no
2138 gap in the access, GROUP_GAP is always 1. */
2139 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2141 prev_init = DR_INIT (data_ref);
2142 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2143 /* Count the number of data-refs in the chain. */
2144 count++;
2147 /* COUNT is the number of accesses found; we multiply it by the size of
2148 the type to get COUNT_IN_BYTES. */
2149 count_in_bytes = type_size * count;
2151 /* Check that the size of the interleaving (including gaps) is not
2152 greater than STEP. */
2153 if (dr_step != 0
2154 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2156 if (dump_enabled_p ())
2158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2159 "interleaving size is greater than step for ");
2160 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2161 DR_REF (dr));
2162 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2164 return false;
2167 /* Check that the size of the interleaving is equal to STEP for stores,
2168 i.e., that there are no gaps. */
2169 if (dr_step != 0
2170 && absu_hwi (dr_step) != count_in_bytes)
2172 if (DR_IS_READ (dr))
2174 slp_impossible = true;
2175 /* There is a gap after the last load in the group. This gap is a
2176 difference between the groupsize and the number of elements.
2177 When there is no gap, this difference should be 0. */
2178 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
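/* Illustrative case: loads of a[4*i] and a[4*i+1] with 4-byte elements
give DR_STEP == 16, GROUPSIZE == 4 and COUNT == 2, so the first
statement of the group records a trailing gap of GROUPSIZE - COUNT == 2
elements. */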
2180 else
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184 "interleaved store with gaps\n");
2185 return false;
2189 /* Check that STEP is a multiple of type size. */
2190 if (dr_step != 0
2191 && (dr_step % type_size) != 0)
2193 if (dump_enabled_p ())
2195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2196 "step is not a multiple of type size: step ");
2197 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2198 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2199 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2200 TYPE_SIZE_UNIT (scalar_type));
2201 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2203 return false;
2206 if (groupsize == 0)
2207 groupsize = count;
2209 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2210 if (dump_enabled_p ())
2211 dump_printf_loc (MSG_NOTE, vect_location,
2212 "Detected interleaving of size %d\n", (int)groupsize);
2214 /* SLP: create an SLP data structure for every interleaving group of
2215 stores for further analysis in vect_analyze_slp. */
2216 if (DR_IS_WRITE (dr) && !slp_impossible)
2218 if (loop_vinfo)
2219 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2220 if (bb_vinfo)
2221 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2224 /* There is a gap at the end of the group. */
2225 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2229 "Data access with gaps requires scalar "
2230 "epilogue loop\n");
2231 if (loop->inner)
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2235 "Peeling for outer loop is not supported\n");
2236 return false;
2239 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2243 return true;
2247 /* Analyze the access pattern of the data-reference DR.
2248 In case of non-consecutive accesses call vect_analyze_group_access() to
2249 analyze groups of accesses. */
2251 static bool
2252 vect_analyze_data_ref_access (struct data_reference *dr)
2254 tree step = DR_STEP (dr);
2255 tree scalar_type = TREE_TYPE (DR_REF (dr));
2256 gimple stmt = DR_STMT (dr);
2257 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2258 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2259 struct loop *loop = NULL;
2261 if (loop_vinfo)
2262 loop = LOOP_VINFO_LOOP (loop_vinfo);
2264 if (loop_vinfo && !step)
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2268 "bad data-ref access in loop\n");
2269 return false;
2272 /* Allow invariant loads in not nested loops. */
2273 if (loop_vinfo && integer_zerop (step))
2275 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2276 if (nested_in_vect_loop_p (loop, stmt))
2278 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_NOTE, vect_location,
2280 "zero step in inner loop of nest\n");
2281 return false;
2283 return DR_IS_READ (dr);
2286 if (loop && nested_in_vect_loop_p (loop, stmt))
2288 /* Interleaved accesses are not yet supported within outer-loop
2289 vectorization for references in the inner-loop. */
2290 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2292 /* For the rest of the analysis we use the outer-loop step. */
2293 step = STMT_VINFO_DR_STEP (stmt_info);
2294 if (integer_zerop (step))
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_NOTE, vect_location,
2298 "zero step in outer loop.\n");
2299 if (DR_IS_READ (dr))
2300 return true;
2301 else
2302 return false;
2306 /* Consecutive? */
2307 if (TREE_CODE (step) == INTEGER_CST)
2309 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2310 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2311 || (dr_step < 0
2312 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2314 /* Mark that it is not interleaving. */
2315 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2316 return true;
2320 if (loop && nested_in_vect_loop_p (loop, stmt))
2322 if (dump_enabled_p ())
2323 dump_printf_loc (MSG_NOTE, vect_location,
2324 "grouped access in outer loop.\n");
2325 return false;
2328 /* Assume this is a DR handled by non-constant strided load case. */
2329 if (TREE_CODE (step) != INTEGER_CST)
2330 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2332 /* Not consecutive access - check if it's a part of interleaving group. */
2333 return vect_analyze_group_access (dr);
2338 /* A helper function used in the comparator function to sort data
2339 references. T1 and T2 are the two trees to be compared.
2340 The function returns -1, 0, or 1. */
2342 static int
2343 compare_tree (tree t1, tree t2)
2345 int i, cmp;
2346 enum tree_code code;
2347 char tclass;
2349 if (t1 == t2)
2350 return 0;
2351 if (t1 == NULL)
2352 return -1;
2353 if (t2 == NULL)
2354 return 1;
2357 if (TREE_CODE (t1) != TREE_CODE (t2))
2358 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2360 code = TREE_CODE (t1);
2361 switch (code)
2363 /* For const values, we can just use hash values for comparisons. */
2364 case INTEGER_CST:
2365 case REAL_CST:
2366 case FIXED_CST:
2367 case STRING_CST:
2368 case COMPLEX_CST:
2369 case VECTOR_CST:
2371 hashval_t h1 = iterative_hash_expr (t1, 0);
2372 hashval_t h2 = iterative_hash_expr (t2, 0);
2373 if (h1 != h2)
2374 return h1 < h2 ? -1 : 1;
2375 break;
2378 case SSA_NAME:
2379 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2380 if (cmp != 0)
2381 return cmp;
2383 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2384 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2385 break;
2387 default:
2388 tclass = TREE_CODE_CLASS (code);
2390 /* For var-decl, we could compare their UIDs. */
2391 if (tclass == tcc_declaration)
2393 if (DECL_UID (t1) != DECL_UID (t2))
2394 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2395 break;
2398 /* For expressions with operands, compare their operands recursively. */
2399 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2401 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2402 if (cmp != 0)
2403 return cmp;
2407 return 0;
2411 /* Compare two data-references DRA and DRB to order them into chunks
2412 suitable for grouping. */
2414 static int
2415 dr_group_sort_cmp (const void *dra_, const void *drb_)
2417 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2418 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2419 int cmp;
2421 /* Stabilize sort. */
2422 if (dra == drb)
2423 return 0;
2425 /* Ordering of DRs according to base. */
2426 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2428 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2429 if (cmp != 0)
2430 return cmp;
2433 /* And according to DR_OFFSET. */
2434 if (!dr_equal_offsets_p (dra, drb))
2436 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2437 if (cmp != 0)
2438 return cmp;
2441 /* Put reads before writes. */
2442 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2443 return DR_IS_READ (dra) ? -1 : 1;
2445 /* Then sort after access size. */
2446 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2447 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2449 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2450 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2451 if (cmp != 0)
2452 return cmp;
2455 /* And after step. */
2456 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2458 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2459 if (cmp != 0)
2460 return cmp;
2463 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2464 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2465 if (cmp == 0)
2466 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2467 return cmp;
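/* For instance, given the reads a[i+1], b[i] and a[i] (hypothetical
accesses of the same size and step), the comparator above places the two
accesses to a next to each other, with a[i] before a[i+1] (ordered by
DR_INIT), and b[i] in a separate block. This is what allows
vect_analyze_data_ref_accesses below to build the interleaving chains in
a single linear scan. */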
2470 /* Function vect_analyze_data_ref_accesses.
2472 Analyze the access pattern of all the data references in the loop.
2474 FORNOW: the only access pattern that is considered vectorizable is a
2475 simple step 1 (consecutive) access.
2477 FORNOW: handle only arrays and pointer accesses. */
2479 bool
2480 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2482 unsigned int i;
2483 vec<data_reference_p> datarefs;
2484 struct data_reference *dr;
2486 if (dump_enabled_p ())
2487 dump_printf_loc (MSG_NOTE, vect_location,
2488 "=== vect_analyze_data_ref_accesses ===\n");
2490 if (loop_vinfo)
2491 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2492 else
2493 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2495 if (datarefs.is_empty ())
2496 return true;
2498 /* Sort the array of datarefs to make building the interleaving chains
2499 linear. Don't modify the original vector's order, it is needed for
2500 determining what dependencies are reversed. */
2501 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2502 qsort (datarefs_copy.address (), datarefs_copy.length (),
2503 sizeof (data_reference_p), dr_group_sort_cmp);
2505 /* Build the interleaving chains. */
2506 for (i = 0; i < datarefs_copy.length () - 1;)
2508 data_reference_p dra = datarefs_copy[i];
2509 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2510 stmt_vec_info lastinfo = NULL;
2511 for (i = i + 1; i < datarefs_copy.length (); ++i)
2513 data_reference_p drb = datarefs_copy[i];
2514 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2516 /* ??? Imperfect sorting (non-compatible types, non-modulo
2517 accesses, same accesses) can lead to a group being artificially
2518 split here as we don't just skip over those. If it really
2519 matters we can push those to a worklist and re-iterate
2520 over them. Then we can just skip ahead to the next DR here. */
2522 /* Check that the data-refs have same first location (except init)
2523 and they are both either store or load (not load and store). */
2524 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2525 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2526 DR_BASE_ADDRESS (drb), 0)
2527 || !dr_equal_offsets_p (dra, drb))
2528 break;
2530 /* Check that the data-refs have the same constant size and step. */
2531 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2532 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2533 if (!tree_fits_uhwi_p (sza)
2534 || !tree_fits_uhwi_p (szb)
2535 || !tree_int_cst_equal (sza, szb)
2536 || !tree_fits_shwi_p (DR_STEP (dra))
2537 || !tree_fits_shwi_p (DR_STEP (drb))
2538 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2539 break;
2541 /* Do not place the same access in the interleaving chain twice. */
2542 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2543 break;
2545 /* Check the types are compatible.
2546 ??? We don't distinguish this during sorting. */
2547 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2548 TREE_TYPE (DR_REF (drb))))
2549 break;
2551 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2552 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2553 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2554 gcc_assert (init_a < init_b);
2556 /* If init_b == init_a + the size of the type * k, we have an
2557 interleaving, and DRA is accessed before DRB. */
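/* E.g., with 4-byte elements, DR_INIT (dra) == 0 and DR_INIT (drb) == 8
give k == 2, i.e. DRB accesses the third element of the group started
by DRA. */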
2558 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2559 if ((init_b - init_a) % type_size_a != 0)
2560 break;
2562 /* The step (if not zero) is greater than the difference between
2563 data-refs' inits. This splits groups into suitable sizes. */
2564 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2565 if (step != 0 && step <= (init_b - init_a))
2566 break;
2568 if (dump_enabled_p ())
2570 dump_printf_loc (MSG_NOTE, vect_location,
2571 "Detected interleaving ");
2572 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2573 dump_printf (MSG_NOTE, " and ");
2574 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2575 dump_printf (MSG_NOTE, "\n");
2578 /* Link the found element into the group list. */
2579 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2581 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2582 lastinfo = stmtinfo_a;
2584 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2585 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2586 lastinfo = stmtinfo_b;
2590 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2591 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2592 && !vect_analyze_data_ref_access (dr))
2594 if (dump_enabled_p ())
2595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2596 "not vectorized: complicated access pattern.\n");
2598 if (bb_vinfo)
2600 /* Mark the statement as not vectorizable. */
2601 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2602 continue;
2604 else
2606 datarefs_copy.release ();
2607 return false;
2611 datarefs_copy.release ();
2612 return true;
2616 /* Operator == between two dr_with_seg_len objects.
2618 This equality operator is used to make sure two data refs
2619 are the same one, so that we will consider combining the
2620 aliasing checks of those two pairs of data-dependent data
2621 refs. */
2623 static bool
2624 operator == (const dr_with_seg_len& d1,
2625 const dr_with_seg_len& d2)
2627 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2628 DR_BASE_ADDRESS (d2.dr), 0)
2629 && compare_tree (d1.offset, d2.offset) == 0
2630 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2633 /* Function comp_dr_with_seg_len_pair.
2635 Comparison function for sorting objects of dr_with_seg_len_pair_t
2636 so that we can combine aliasing checks in one scan. */
2638 static int
2639 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2641 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2642 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2644 const dr_with_seg_len &p11 = p1->first,
2645 &p12 = p1->second,
2646 &p21 = p2->first,
2647 &p22 = p2->second;
2649 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2650 if a and c have the same base address and step, and b and d have the same
2651 address and step. Therefore, if either a&c or b&d don't have the same address
2652 and step, we don't care about the order of those two pairs after sorting. */
2653 int comp_res;
2655 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2656 DR_BASE_ADDRESS (p21.dr))) != 0)
2657 return comp_res;
2658 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2659 DR_BASE_ADDRESS (p22.dr))) != 0)
2660 return comp_res;
2661 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2662 return comp_res;
2663 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2664 return comp_res;
2665 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2666 return comp_res;
2667 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2668 return comp_res;
2670 return 0;
2673 template <class T> static void
2674 swap (T& a, T& b)
2676 T c (a);
2677 a = b;
2678 b = c;
2681 /* Function vect_vfa_segment_size.
2683 Create an expression that computes the size of the segment
2684 that will be accessed for a data reference. The function takes into
2685 account that realignment loads may access one more vector.
2687 Input:
2688 DR: The data reference.
2689 LENGTH_FACTOR: segment length to consider.
2691 Return an expression whose value is the size of segment which will be
2692 accessed by DR. */
2694 static tree
2695 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2697 tree segment_length;
2699 if (integer_zerop (DR_STEP (dr)))
2700 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2701 else
2702 segment_length = size_binop (MULT_EXPR,
2703 fold_convert (sizetype, DR_STEP (dr)),
2704 fold_convert (sizetype, length_factor));
2706 if (vect_supportable_dr_alignment (dr, false)
2707 == dr_explicit_realign_optimized)
2709 tree vector_size = TYPE_SIZE_UNIT
2710 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2712 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2714 return segment_length;
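/* As an example of the computation above: a data reference with
DR_STEP == 4 and a LENGTH_FACTOR equal to a vectorization factor of 8
gets a segment length of 32 bytes; one more vector is added on top of
that only when the realignment-optimized scheme is used. */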
2717 /* Function vect_prune_runtime_alias_test_list.
2719 Prune a list of ddrs to be tested at run-time by versioning for alias.
2720 Merge several alias checks into one if possible.
2721 Return FALSE if the resulting list of ddrs is longer than allowed by
2722 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2724 bool
2725 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2727 vec<ddr_p> may_alias_ddrs =
2728 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2729 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2730 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2731 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2732 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2734 ddr_p ddr;
2735 unsigned int i;
2736 tree length_factor;
2738 if (dump_enabled_p ())
2739 dump_printf_loc (MSG_NOTE, vect_location,
2740 "=== vect_prune_runtime_alias_test_list ===\n");
2742 if (may_alias_ddrs.is_empty ())
2743 return true;
2745 /* Basically, for each pair of dependent data refs store_ptr_0
2746 and load_ptr_0, we create an expression:
2748 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2749 || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
2751 for aliasing checks. However, in some cases we can decrease
2752 the number of checks by combining two checks into one. For
2753 example, suppose we have another pair of data refs store_ptr_0
2754 and load_ptr_1, and if the following condition is satisfied:
2756 load_ptr_0 < load_ptr_1 &&
2757 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2759 (this condition means that, in each iteration of the vectorized loop,
2760 the memory accessed through store_ptr_0 cannot lie between the memory
2761 of load_ptr_0 and load_ptr_1.)
2763 we can then use only the following expression to finish the
2764 aliasing checks between store_ptr_0 & load_ptr_0 and
2765 store_ptr_0 & load_ptr_1:
2767 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2768 || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
2770 Note that we only consider the case where load_ptr_0 and load_ptr_1
2771 have the same base address. */
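/* A small worked example of the merge described above (addresses are
hypothetical): suppose load_ptr_0 == &a[i], load_ptr_1 == &a[i+4], the
elements are 4 bytes wide and load_segment_length_0 == 16. Then
load_ptr_1 - load_ptr_0 - load_segment_length_0 == 16 - 16 == 0, which
is smaller than any positive store_segment_length_0, so the two checks
against store_ptr_0 can be folded into the single range check shown
above. */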
2773 comp_alias_ddrs.create (may_alias_ddrs.length ());
2775 /* First, we collect all data ref pairs for aliasing checks. */
2776 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2778 struct data_reference *dr_a, *dr_b;
2779 gimple dr_group_first_a, dr_group_first_b;
2780 tree segment_length_a, segment_length_b;
2781 gimple stmt_a, stmt_b;
2783 dr_a = DDR_A (ddr);
2784 stmt_a = DR_STMT (DDR_A (ddr));
2785 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2786 if (dr_group_first_a)
2788 stmt_a = dr_group_first_a;
2789 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2792 dr_b = DDR_B (ddr);
2793 stmt_b = DR_STMT (DDR_B (ddr));
2794 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2795 if (dr_group_first_b)
2797 stmt_b = dr_group_first_b;
2798 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2801 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2802 length_factor = scalar_loop_iters;
2803 else
2804 length_factor = size_int (vect_factor);
2805 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2806 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2808 dr_with_seg_len_pair_t dr_with_seg_len_pair
2809 (dr_with_seg_len (dr_a, segment_length_a),
2810 dr_with_seg_len (dr_b, segment_length_b));
2812 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2813 swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2815 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2818 /* Second, we sort the collected data ref pairs so that we can scan
2819 them once to combine all possible aliasing checks. */
2820 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2822 /* Third, we scan the sorted dr pairs and check if we can combine
2823 alias checks of two neighbouring dr pairs. */
2824 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2826 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2827 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2828 *dr_b1 = &comp_alias_ddrs[i-1].second,
2829 *dr_a2 = &comp_alias_ddrs[i].first,
2830 *dr_b2 = &comp_alias_ddrs[i].second;
2832 /* Remove duplicate data ref pairs. */
2833 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2835 if (dump_enabled_p ())
2837 dump_printf_loc (MSG_NOTE, vect_location,
2838 "found equal ranges ");
2839 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2840 DR_REF (dr_a1->dr));
2841 dump_printf (MSG_NOTE, ", ");
2842 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2843 DR_REF (dr_b1->dr));
2844 dump_printf (MSG_NOTE, " and ");
2845 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2846 DR_REF (dr_a2->dr));
2847 dump_printf (MSG_NOTE, ", ");
2848 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2849 DR_REF (dr_b2->dr));
2850 dump_printf (MSG_NOTE, "\n");
2853 comp_alias_ddrs.ordered_remove (i--);
2854 continue;
2857 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2859 /* We consider the case where DR_B1 and DR_B2 are the same memrefs,
2860 and DR_A1 and DR_A2 are two consecutive memrefs. */
2861 if (*dr_a1 == *dr_a2)
2863 swap (dr_a1, dr_b1);
2864 swap (dr_a2, dr_b2);
2867 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2868 DR_BASE_ADDRESS (dr_a2->dr),
2870 || !tree_fits_shwi_p (dr_a1->offset)
2871 || !tree_fits_shwi_p (dr_a2->offset))
2872 continue;
2874 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2875 - tree_to_shwi (dr_a1->offset));
2878 /* Now we check if the following condition is satisfied:
2880 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2882 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2883 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant, so we
2884 have to make a best estimate. We can get the minimum value
2885 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B;
2886 then either of the following two conditions can guarantee the
2887 one above:
2889 1: DIFF <= MIN_SEG_LEN_B
2890 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2894 HOST_WIDE_INT
2895 min_seg_len_b = (TREE_CODE (dr_b1->seg_len) == INTEGER_CST) ?
2896 TREE_INT_CST_LOW (dr_b1->seg_len) :
2897 vect_factor;
2899 if (diff <= min_seg_len_b
2900 || (TREE_CODE (dr_a1->seg_len) == INTEGER_CST
2901 && diff - (HOST_WIDE_INT) TREE_INT_CST_LOW (dr_a1->seg_len) <
2902 min_seg_len_b))
2904 if (dump_enabled_p ())
2906 dump_printf_loc (MSG_NOTE, vect_location,
2907 "merging ranges for ");
2908 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2909 DR_REF (dr_a1->dr));
2910 dump_printf (MSG_NOTE, ", ");
2911 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2912 DR_REF (dr_b1->dr));
2913 dump_printf (MSG_NOTE, " and ");
2914 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2915 DR_REF (dr_a2->dr));
2916 dump_printf (MSG_NOTE, ", ");
2917 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2918 DR_REF (dr_b2->dr));
2919 dump_printf (MSG_NOTE, "\n");
2922 dr_a1->seg_len = size_binop (PLUS_EXPR,
2923 dr_a2->seg_len, size_int (diff));
2924 comp_alias_ddrs.ordered_remove (i--);
2929 dump_printf_loc (MSG_NOTE, vect_location,
2930 "improved number of alias checks from %d to %d\n",
2931 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2932 if ((int) comp_alias_ddrs.length () >
2933 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2934 return false;
2936 return true;
2939 /* Check whether a non-affine read in stmt is suitable for gather load
2940 and if so, return a builtin decl for that operation. */
2942 tree
2943 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2944 tree *offp, int *scalep)
2946 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2947 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2948 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2949 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2950 tree offtype = NULL_TREE;
2951 tree decl, base, off;
2952 enum machine_mode pmode;
2953 int punsignedp, pvolatilep;
2955 base = DR_REF (dr);
2956 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2957 see if we can use the def stmt of the address. */
2958 if (is_gimple_call (stmt)
2959 && gimple_call_internal_p (stmt)
2960 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2961 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2962 && TREE_CODE (base) == MEM_REF
2963 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2964 && integer_zerop (TREE_OPERAND (base, 1))
2965 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2967 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2968 if (is_gimple_assign (def_stmt)
2969 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
2970 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2973 /* The gather builtins need an address of the form
2974 loop_invariant + vector * {1, 2, 4, 8} or
2976 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2977 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2978 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2979 multiplications and additions in it. To get a vector, we need
2980 a single SSA_NAME that will be defined in the loop and will
2981 contain everything that is not loop invariant and that can be
2982 vectorized. The following code attempts to find such a preexisting
2983 SSA_NAME OFF and put the loop invariants into a tree BASE
2984 that can be gimplified before the loop. */
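/* A hypothetical example of the decomposition this code is looking for:
for a read of the form x[idx[i]], where x is loop invariant and idx[i]
is loaded inside the loop, we end up with BASE == &x, OFF == the
SSA_NAME holding idx[i] (possibly before a widening cast, recorded in
OFFTYPE) and SCALE == the element size of x, assuming that size is
1, 2, 4 or 8. This matches the loop_invariant + vector * scale shape
required by the gather builtins. */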
2985 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
2986 &pmode, &punsignedp, &pvolatilep, false);
2987 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2989 if (TREE_CODE (base) == MEM_REF)
2991 if (!integer_zerop (TREE_OPERAND (base, 1)))
2993 if (off == NULL_TREE)
2995 double_int moff = mem_ref_offset (base);
2996 off = double_int_to_tree (sizetype, moff);
2998 else
2999 off = size_binop (PLUS_EXPR, off,
3000 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3002 base = TREE_OPERAND (base, 0);
3004 else
3005 base = build_fold_addr_expr (base);
3007 if (off == NULL_TREE)
3008 off = size_zero_node;
3010 /* If base is not loop invariant, then, if off is 0, we start with just
3011 the constant offset in the loop invariant BASE and continue with base
3012 as OFF; otherwise give up.
3013 We could handle that case by gimplifying the addition of base + off
3014 into some SSA_NAME and use that as off, but for now punt. */
3015 if (!expr_invariant_in_loop_p (loop, base))
3017 if (!integer_zerop (off))
3018 return NULL_TREE;
3019 off = base;
3020 base = size_int (pbitpos / BITS_PER_UNIT);
3022 /* Otherwise put base + constant offset into the loop invariant BASE
3023 and continue with OFF. */
3024 else
3026 base = fold_convert (sizetype, base);
3027 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3030 /* OFF at this point may be either a SSA_NAME or some tree expression
3031 from get_inner_reference. Try to peel off loop invariants from it
3032 into BASE as long as possible. */
3033 STRIP_NOPS (off);
3034 while (offtype == NULL_TREE)
3036 enum tree_code code;
3037 tree op0, op1, add = NULL_TREE;
3039 if (TREE_CODE (off) == SSA_NAME)
3041 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3043 if (expr_invariant_in_loop_p (loop, off))
3044 return NULL_TREE;
3046 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3047 break;
3049 op0 = gimple_assign_rhs1 (def_stmt);
3050 code = gimple_assign_rhs_code (def_stmt);
3051 op1 = gimple_assign_rhs2 (def_stmt);
3053 else
3055 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3056 return NULL_TREE;
3057 code = TREE_CODE (off);
3058 extract_ops_from_tree (off, &code, &op0, &op1);
3060 switch (code)
3062 case POINTER_PLUS_EXPR:
3063 case PLUS_EXPR:
3064 if (expr_invariant_in_loop_p (loop, op0))
3066 add = op0;
3067 off = op1;
3068 do_add:
3069 add = fold_convert (sizetype, add);
3070 if (scale != 1)
3071 add = size_binop (MULT_EXPR, add, size_int (scale));
3072 base = size_binop (PLUS_EXPR, base, add);
3073 continue;
3075 if (expr_invariant_in_loop_p (loop, op1))
3077 add = op1;
3078 off = op0;
3079 goto do_add;
3081 break;
3082 case MINUS_EXPR:
3083 if (expr_invariant_in_loop_p (loop, op1))
3085 add = fold_convert (sizetype, op1);
3086 add = size_binop (MINUS_EXPR, size_zero_node, add);
3087 off = op0;
3088 goto do_add;
3090 break;
3091 case MULT_EXPR:
3092 if (scale == 1 && tree_fits_shwi_p (op1))
3094 scale = tree_to_shwi (op1);
3095 off = op0;
3096 continue;
3098 break;
3099 case SSA_NAME:
3100 off = op0;
3101 continue;
3102 CASE_CONVERT:
3103 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3104 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3105 break;
3106 if (TYPE_PRECISION (TREE_TYPE (op0))
3107 == TYPE_PRECISION (TREE_TYPE (off)))
3109 off = op0;
3110 continue;
3112 if (TYPE_PRECISION (TREE_TYPE (op0))
3113 < TYPE_PRECISION (TREE_TYPE (off)))
3115 off = op0;
3116 offtype = TREE_TYPE (off);
3117 STRIP_NOPS (off);
3118 continue;
3120 break;
3121 default:
3122 break;
3124 break;
3127 /* If at the end OFF still isn't a SSA_NAME or isn't
3128 defined in the loop, punt. */
3129 if (TREE_CODE (off) != SSA_NAME
3130 || expr_invariant_in_loop_p (loop, off))
3131 return NULL_TREE;
3133 if (offtype == NULL_TREE)
3134 offtype = TREE_TYPE (off);
3136 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3137 offtype, scale);
3138 if (decl == NULL_TREE)
3139 return NULL_TREE;
3141 if (basep)
3142 *basep = base;
3143 if (offp)
3144 *offp = off;
3145 if (scalep)
3146 *scalep = scale;
3147 return decl;
3150 /* Function vect_analyze_data_refs.
3152 Find all the data references in the loop or basic block.
3154 The general structure of the analysis of data refs in the vectorizer is as
3155 follows:
3156 1- vect_analyze_data_refs(loop/bb): call
3157 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3158 in the loop/bb and their dependences.
3159 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3160 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3161 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3165 bool
3166 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3167 bb_vec_info bb_vinfo,
3168 int *min_vf)
3170 struct loop *loop = NULL;
3171 basic_block bb = NULL;
3172 unsigned int i;
3173 vec<data_reference_p> datarefs;
3174 struct data_reference *dr;
3175 tree scalar_type;
3177 if (dump_enabled_p ())
3178 dump_printf_loc (MSG_NOTE, vect_location,
3179 "=== vect_analyze_data_refs ===\n");
3181 if (loop_vinfo)
3183 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3185 loop = LOOP_VINFO_LOOP (loop_vinfo);
3186 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3187 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3189 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3191 "not vectorized: loop contains function calls"
3192 " or data references that cannot be analyzed\n");
3193 return false;
3196 for (i = 0; i < loop->num_nodes; i++)
3198 gimple_stmt_iterator gsi;
3200 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3202 gimple stmt = gsi_stmt (gsi);
3203 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3205 if (is_gimple_call (stmt) && loop->safelen)
3207 tree fndecl = gimple_call_fndecl (stmt), op;
3208 if (fndecl != NULL_TREE)
3210 struct cgraph_node *node = cgraph_get_node (fndecl);
3211 if (node != NULL && node->simd_clones != NULL)
3213 unsigned int j, n = gimple_call_num_args (stmt);
3214 for (j = 0; j < n; j++)
3216 op = gimple_call_arg (stmt, j);
3217 if (DECL_P (op)
3218 || (REFERENCE_CLASS_P (op)
3219 && get_base_address (op)))
3220 break;
3222 op = gimple_call_lhs (stmt);
3223 /* Ignore #pragma omp declare simd functions
3224 if they don't have data references in the
3225 call stmt itself. */
3226 if (j == n
3227 && !(op
3228 && (DECL_P (op)
3229 || (REFERENCE_CLASS_P (op)
3230 && get_base_address (op)))))
3231 continue;
3235 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3236 if (dump_enabled_p ())
3237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3238 "not vectorized: loop contains function "
3239 "calls or data references that cannot "
3240 "be analyzed\n");
3241 return false;
3246 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3248 else
3250 gimple_stmt_iterator gsi;
3252 bb = BB_VINFO_BB (bb_vinfo);
3253 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3255 gimple stmt = gsi_stmt (gsi);
3256 if (!find_data_references_in_stmt (NULL, stmt,
3257 &BB_VINFO_DATAREFS (bb_vinfo)))
3259 /* Mark the rest of the basic-block as unvectorizable. */
3260 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3262 stmt = gsi_stmt (gsi);
3263 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3265 break;
3269 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3272 /* Go through the data-refs, check that the analysis succeeded. Update
3273 pointer from stmt_vec_info struct to DR and vectype. */
3275 FOR_EACH_VEC_ELT (datarefs, i, dr)
3277 gimple stmt;
3278 stmt_vec_info stmt_info;
3279 tree base, offset, init;
3280 bool gather = false;
3281 bool simd_lane_access = false;
3282 int vf;
3284 again:
3285 if (!dr || !DR_REF (dr))
3287 if (dump_enabled_p ())
3288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3289 "not vectorized: unhandled data-ref\n");
3290 return false;
3293 stmt = DR_STMT (dr);
3294 stmt_info = vinfo_for_stmt (stmt);
3296 /* Discard clobbers from the dataref vector. We will remove
3297 clobber stmts during vectorization. */
3298 if (gimple_clobber_p (stmt))
3300 free_data_ref (dr);
3301 if (i == datarefs.length () - 1)
3303 datarefs.pop ();
3304 break;
3306 datarefs.ordered_remove (i);
3307 dr = datarefs[i];
3308 goto again;
3311 /* Check that analysis of the data-ref succeeded. */
3312 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3313 || !DR_STEP (dr))
3315 bool maybe_gather
3316 = DR_IS_READ (dr)
3317 && !TREE_THIS_VOLATILE (DR_REF (dr))
3318 && targetm.vectorize.builtin_gather != NULL;
3319 bool maybe_simd_lane_access
3320 = loop_vinfo && loop->simduid;
3322 /* If the target supports vector gather loads, or if this might be
3323 a SIMD lane access, see if they can be used. */
3324 if (loop_vinfo
3325 && (maybe_gather || maybe_simd_lane_access)
3326 && !nested_in_vect_loop_p (loop, stmt))
3328 struct data_reference *newdr
3329 = create_data_ref (NULL, loop_containing_stmt (stmt),
3330 DR_REF (dr), stmt, true);
3331 gcc_assert (newdr != NULL && DR_REF (newdr));
3332 if (DR_BASE_ADDRESS (newdr)
3333 && DR_OFFSET (newdr)
3334 && DR_INIT (newdr)
3335 && DR_STEP (newdr)
3336 && integer_zerop (DR_STEP (newdr)))
3338 if (maybe_simd_lane_access)
3340 tree off = DR_OFFSET (newdr);
3341 STRIP_NOPS (off);
3342 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3343 && TREE_CODE (off) == MULT_EXPR
3344 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3346 tree step = TREE_OPERAND (off, 1);
3347 off = TREE_OPERAND (off, 0);
3348 STRIP_NOPS (off);
3349 if (CONVERT_EXPR_P (off)
3350 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3351 0)))
3352 < TYPE_PRECISION (TREE_TYPE (off)))
3353 off = TREE_OPERAND (off, 0);
3354 if (TREE_CODE (off) == SSA_NAME)
3356 gimple def = SSA_NAME_DEF_STMT (off);
3357 tree reft = TREE_TYPE (DR_REF (newdr));
3358 if (is_gimple_call (def)
3359 && gimple_call_internal_p (def)
3360 && (gimple_call_internal_fn (def)
3361 == IFN_GOMP_SIMD_LANE))
3363 tree arg = gimple_call_arg (def, 0);
3364 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3365 arg = SSA_NAME_VAR (arg);
3366 if (arg == loop->simduid
3367 /* For now. */
3368 && tree_int_cst_equal
3369 (TYPE_SIZE_UNIT (reft),
3370 step))
3372 DR_OFFSET (newdr) = ssize_int (0);
3373 DR_STEP (newdr) = step;
3374 DR_ALIGNED_TO (newdr)
3375 = size_int (BIGGEST_ALIGNMENT);
3376 dr = newdr;
3377 simd_lane_access = true;
3383 if (!simd_lane_access && maybe_gather)
3385 dr = newdr;
3386 gather = true;
3389 if (!gather && !simd_lane_access)
3390 free_data_ref (newdr);
3393 if (!gather && !simd_lane_access)
3395 if (dump_enabled_p ())
3397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3398 "not vectorized: data ref analysis "
3399 "failed ");
3400 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3401 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3404 if (bb_vinfo)
3405 break;
3407 return false;
3411 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3413 if (dump_enabled_p ())
3414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3415 "not vectorized: base addr of dr is a "
3416 "constant\n");
3418 if (bb_vinfo)
3419 break;
3421 if (gather || simd_lane_access)
3422 free_data_ref (dr);
3423 return false;
3426 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3428 if (dump_enabled_p ())
3430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3431 "not vectorized: volatile type ");
3432 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3433 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3436 if (bb_vinfo)
3437 break;
3439 return false;
3442 if (stmt_can_throw_internal (stmt))
3444 if (dump_enabled_p ())
3446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3447 "not vectorized: statement can throw an "
3448 "exception ");
3449 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3450 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3453 if (bb_vinfo)
3454 break;
3456 if (gather || simd_lane_access)
3457 free_data_ref (dr);
3458 return false;
3461 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3462 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3464 if (dump_enabled_p ())
3466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3467 "not vectorized: statement is bitfield "
3468 "access ");
3469 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3470 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3473 if (bb_vinfo)
3474 break;
3476 if (gather || simd_lane_access)
3477 free_data_ref (dr);
3478 return false;
3481 base = unshare_expr (DR_BASE_ADDRESS (dr));
3482 offset = unshare_expr (DR_OFFSET (dr));
3483 init = unshare_expr (DR_INIT (dr));
3485 if (is_gimple_call (stmt)
3486 && (!gimple_call_internal_p (stmt)
3487 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3488 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3490 if (dump_enabled_p ())
3492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3493 "not vectorized: dr in a call ");
3494 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3495 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3498 if (bb_vinfo)
3499 break;
3501 if (gather || simd_lane_access)
3502 free_data_ref (dr);
3503 return false;
3506 /* Update DR field in stmt_vec_info struct. */
3508 /* If the dataref is in an inner-loop of the loop that is considered
3509 for vectorization, we also want to analyze the access relative to
3510 the outer-loop (DR contains information only relative to the
3511 inner-most enclosing loop). We do that by building a reference to the
3512 first location accessed by the inner-loop, and analyze it relative to
3513 the outer-loop. */
3514 if (loop && nested_in_vect_loop_p (loop, stmt))
3516 tree outer_step, outer_base, outer_init;
3517 HOST_WIDE_INT pbitsize, pbitpos;
3518 tree poffset;
3519 enum machine_mode pmode;
3520 int punsignedp, pvolatilep;
3521 affine_iv base_iv, offset_iv;
3522 tree dinit;
3524 /* Build a reference to the first location accessed by the
3525 inner-loop: *(BASE+INIT). (The first location is actually
3526 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3527 tree inner_base = build_fold_indirect_ref
3528 (fold_build_pointer_plus (base, init));
3530 if (dump_enabled_p ())
3532 dump_printf_loc (MSG_NOTE, vect_location,
3533 "analyze in outer-loop: ");
3534 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3535 dump_printf (MSG_NOTE, "\n");
3538 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3539 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3540 gcc_assert (outer_base != NULL_TREE);
3542 if (pbitpos % BITS_PER_UNIT != 0)
3544 if (dump_enabled_p ())
3545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3546 "failed: bit offset alignment.\n");
3547 return false;
3550 outer_base = build_fold_addr_expr (outer_base);
3551 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3552 &base_iv, false))
3554 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3556 "failed: evolution of base is not affine.\n");
3557 return false;
3560 if (offset)
3562 if (poffset)
3563 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3564 poffset);
3565 else
3566 poffset = offset;
3569 if (!poffset)
3571 offset_iv.base = ssize_int (0);
3572 offset_iv.step = ssize_int (0);
3574 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3575 &offset_iv, false))
3577 if (dump_enabled_p ())
3578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3579 "evolution of offset is not affine.\n");
3580 return false;
3583 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3584 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3585 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3586 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3587 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3589 outer_step = size_binop (PLUS_EXPR,
3590 fold_convert (ssizetype, base_iv.step),
3591 fold_convert (ssizetype, offset_iv.step));
3593 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3594 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3595 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3596 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3597 STMT_VINFO_DR_OFFSET (stmt_info) =
3598 fold_convert (ssizetype, offset_iv.base);
3599 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3600 size_int (highest_pow2_factor (offset_iv.base));
3602 if (dump_enabled_p ())
3604 dump_printf_loc (MSG_NOTE, vect_location,
3605 "\touter base_address: ");
3606 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3607 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3608 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3609 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3610 STMT_VINFO_DR_OFFSET (stmt_info));
3611 dump_printf (MSG_NOTE,
3612 "\n\touter constant offset from base address: ");
3613 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3614 STMT_VINFO_DR_INIT (stmt_info));
3615 dump_printf (MSG_NOTE, "\n\touter step: ");
3616 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3617 STMT_VINFO_DR_STEP (stmt_info));
3618 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3619 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3620 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3621 dump_printf (MSG_NOTE, "\n");
3625 if (STMT_VINFO_DATA_REF (stmt_info))
3627 if (dump_enabled_p ())
3629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3630 "not vectorized: more than one data ref "
3631 "in stmt: ");
3632 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3633 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3636 if (bb_vinfo)
3637 break;
3639 if (gather || simd_lane_access)
3640 free_data_ref (dr);
3641 return false;
3644 STMT_VINFO_DATA_REF (stmt_info) = dr;
3645 if (simd_lane_access)
3647 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3648 free_data_ref (datarefs[i]);
3649 datarefs[i] = dr;
3652 /* Set vectype for STMT. */
3653 scalar_type = TREE_TYPE (DR_REF (dr));
3654 STMT_VINFO_VECTYPE (stmt_info)
3655 = get_vectype_for_scalar_type (scalar_type);
3656 if (!STMT_VINFO_VECTYPE (stmt_info))
3658 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3661 "not vectorized: no vectype for stmt: ");
3662 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3663 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3664 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3665 scalar_type);
3666 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3669 if (bb_vinfo)
3670 break;
3672 if (gather || simd_lane_access)
3674 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3675 if (gather)
3676 free_data_ref (dr);
3678 return false;
3680 else
3682 if (dump_enabled_p ())
3684 dump_printf_loc (MSG_NOTE, vect_location,
3685 "got vectype for stmt: ");
3686 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3687 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3688 STMT_VINFO_VECTYPE (stmt_info));
3689 dump_printf (MSG_NOTE, "\n");
3693 /* Adjust the minimal vectorization factor according to the
3694 vector type. */
3695 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3696 if (vf > *min_vf)
3697 *min_vf = vf;
3699 if (gather)
3701 tree off;
3703 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3704 if (gather
3705 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3706 gather = false;
3707 if (!gather)
3709 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3710 free_data_ref (dr);
3711 if (dump_enabled_p ())
3713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3714 "not vectorized: not suitable for gather "
3715 "load ");
3716 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3717 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3719 return false;
3722 datarefs[i] = dr;
3723 STMT_VINFO_GATHER_P (stmt_info) = true;
3725 else if (loop_vinfo
3726 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3728 if (nested_in_vect_loop_p (loop, stmt)
3729 || !DR_IS_READ (dr))
3731 if (dump_enabled_p ())
3733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3734 "not vectorized: not suitable for strided "
3735 "load ");
3736 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3737 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3739 return false;
3741 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3745 /* If we stopped the analysis at the first dataref we could not analyze
3746 when trying to vectorize a basic-block, mark the rest of the datarefs
3747 as not vectorizable and truncate the vector of datarefs. That
3748 avoids wasting time analyzing their dependences. */
3749 if (i != datarefs.length ())
3751 gcc_assert (bb_vinfo != NULL);
3752 for (unsigned j = i; j < datarefs.length (); ++j)
3754 data_reference_p dr = datarefs[j];
3755 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3756 free_data_ref (dr);
3758 datarefs.truncate (i);
3761 return true;
3765 /* Function vect_get_new_vect_var.
3767 Returns a name for a new variable. The current naming scheme prepends
3768 the prefix "vect_", "stmp_" or "vectp_" (depending on the value of
3769 VAR_KIND) to NAME if it is provided, and otherwise uses the prefix
3770 alone. */
3772 tree
3773 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3775 const char *prefix;
3776 tree new_vect_var;
3778 switch (var_kind)
3780 case vect_simple_var:
3781 prefix = "vect";
3782 break;
3783 case vect_scalar_var:
3784 prefix = "stmp";
3785 break;
3786 case vect_pointer_var:
3787 prefix = "vectp";
3788 break;
3789 default:
3790 gcc_unreachable ();
3793 if (name)
3795 char* tmp = concat (prefix, "_", name, NULL);
3796 new_vect_var = create_tmp_reg (type, tmp);
3797 free (tmp);
3799 else
3800 new_vect_var = create_tmp_reg (type, prefix);
3802 return new_vect_var;
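/* Illustrative only, not used by the vectorizer: a minimal sketch of how
   vect_get_new_vect_var is typically called and what names it produces.
   The parameter T is a placeholder for some pointer-to-vector type and the
   helper name is hypothetical; everything else follows the function
   defined above.  */
static inline tree
example_vect_var_naming (tree t)
{
  /* With VAR_KIND == vect_pointer_var and NAME == "a" the created
     temporary is named "vectp_a"; with vect_simple_var it would be
     "vect_a", and with vect_scalar_var "stmp_a".  */
  return vect_get_new_vect_var (t, vect_pointer_var, "a");
}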
3806 /* Function vect_create_addr_base_for_vector_ref.
3808 Create an expression that computes the address of the first memory location
3809 that will be accessed for a data reference.
3811 Input:
3812 STMT: The statement containing the data reference.
3813 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3814 OFFSET: Optional. If supplied, it is added to the initial address.
3815 LOOP: Specify relative to which loop-nest should the address be computed.
3816 For example, when the dataref is in an inner-loop nested in an
3817 outer-loop that is now being vectorized, LOOP can be either the
3818 outer-loop, or the inner-loop. The first memory location accessed
3819 by the following dataref ('in' points to short):
3821 for (i=0; i<N; i++)
3822 for (j=0; j<M; j++)
3823 s += in[i+j]
3825 is as follows:
3826 if LOOP=i_loop: &in (relative to i_loop)
3827 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3829 Output:
3830 1. Return an SSA_NAME whose value is the address of the memory location of
3831 the first vector of the data reference.
3832 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3833 these statement(s) which define the returned SSA_NAME.
3835 FORNOW: We are only handling array accesses with step 1. */
3837 tree
3838 vect_create_addr_base_for_vector_ref (gimple stmt,
3839 gimple_seq *new_stmt_list,
3840 tree offset,
3841 struct loop *loop)
3843 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3844 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3845 tree data_ref_base;
3846 const char *base_name;
3847 tree addr_base;
3848 tree dest;
3849 gimple_seq seq = NULL;
3850 tree base_offset;
3851 tree init;
3852 tree vect_ptr_type;
3853 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3854 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3856 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3858 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3860 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3862 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3863 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3864 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3866 else
3868 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3869 base_offset = unshare_expr (DR_OFFSET (dr));
3870 init = unshare_expr (DR_INIT (dr));
3873 if (loop_vinfo)
3874 base_name = get_name (data_ref_base);
3875 else
3877 base_offset = ssize_int (0);
3878 init = ssize_int (0);
3879 base_name = get_name (DR_REF (dr));
3882 /* Create base_offset */
3883 base_offset = size_binop (PLUS_EXPR,
3884 fold_convert (sizetype, base_offset),
3885 fold_convert (sizetype, init));
3887 if (offset)
3889 offset = fold_build2 (MULT_EXPR, sizetype,
3890 fold_convert (sizetype, offset), step);
3891 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3892 base_offset, offset);
3895 /* base + base_offset */
3896 if (loop_vinfo)
3897 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3898 else
3900 addr_base = build1 (ADDR_EXPR,
3901 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3902 unshare_expr (DR_REF (dr)));
3905 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3906 addr_base = fold_convert (vect_ptr_type, addr_base);
3907 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3908 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3909 gimple_seq_add_seq (new_stmt_list, seq);
3911 if (DR_PTR_INFO (dr)
3912 && TREE_CODE (addr_base) == SSA_NAME)
3914 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3915 if (offset)
3916 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3919 if (dump_enabled_p ())
3921 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3922 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3923 dump_printf (MSG_NOTE, "\n");
3926 return addr_base;
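/* Illustrative only, not part of GCC: a plain-C sketch of the address that
   vect_create_addr_base_for_vector_ref computes for the in[i+j] example in
   its comment ('in' points to short).  Relative to the i-loop the first
   accessed location is simply &in[0]; relative to the j-loop it is &in[i],
   i.e. the base address plus i * sizeof (short) bytes.  The function name
   and parameters here are hypothetical.  */
static short *
example_first_access_address (short *in, int i, int relative_to_j_loop)
{
  if (relative_to_j_loop)
    return in + i;   /* &in + i*2B in the notation of the comment above.  */
  return in;         /* &in, the address relative to the i-loop.          */
}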
3930 /* Function vect_create_data_ref_ptr.
3932 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3933 location accessed in the loop by STMT, along with the def-use update
3934 chain to appropriately advance the pointer through the loop iterations.
3935 Also set aliasing information for the pointer. This pointer is used by
3936 the callers to this function to create a memory reference expression for
3937 vector load/store access.
3939 Input:
3940 1. STMT: a stmt that references memory. Expected to be of the form
3941 GIMPLE_ASSIGN <name, data-ref> or
3942 GIMPLE_ASSIGN <data-ref, name>.
3943 2. AGGR_TYPE: the type of the reference, which should be either a vector
3944 or an array.
3945 3. AT_LOOP: the loop where the vector memref is to be created.
3946 4. OFFSET (optional): an offset to be added to the initial address accessed
3947 by the data-ref in STMT.
3948 5. BSI: location where the new stmts are to be placed if there is no loop.
3949 6. ONLY_INIT: indicates whether ap is to be updated in the loop, or is to
3950 remain pointing to the initial address.
3952 Output:
3953 1. Declare a new ptr to vector_type, and have it point to the base of the
3954 data reference (initial address accessed by the data reference).
3955 For example, for vector of type V8HI, the following code is generated:
3957 v8hi *ap;
3958 ap = (v8hi *)initial_address;
3960 if OFFSET is not supplied:
3961 initial_address = &a[init];
3962 if OFFSET is supplied:
3963 initial_address = &a[init + OFFSET];
3965 Return the initial_address in INITIAL_ADDRESS.
3967 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3968 update the pointer in each iteration of the loop.
3970 Return the increment stmt that updates the pointer in PTR_INCR.
3972 3. Set INV_P to true if the access pattern of the data reference in the
3973 vectorized loop is invariant. Set it to false otherwise.
3975 4. Return the pointer. */
3977 tree
3978 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3979 tree offset, tree *initial_address,
3980 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3981 bool only_init, bool *inv_p)
3983 const char *base_name;
3984 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3985 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3986 struct loop *loop = NULL;
3987 bool nested_in_vect_loop = false;
3988 struct loop *containing_loop = NULL;
3989 tree aggr_ptr_type;
3990 tree aggr_ptr;
3991 tree new_temp;
3992 gimple vec_stmt;
3993 gimple_seq new_stmt_list = NULL;
3994 edge pe = NULL;
3995 basic_block new_bb;
3996 tree aggr_ptr_init;
3997 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3998 tree aptr;
3999 gimple_stmt_iterator incr_gsi;
4000 bool insert_after;
4001 tree indx_before_incr, indx_after_incr;
4002 gimple incr;
4003 tree step;
4004 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4006 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4007 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4009 if (loop_vinfo)
4011 loop = LOOP_VINFO_LOOP (loop_vinfo);
4012 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4013 containing_loop = (gimple_bb (stmt))->loop_father;
4014 pe = loop_preheader_edge (loop);
4016 else
4018 gcc_assert (bb_vinfo);
4019 only_init = true;
4020 *ptr_incr = NULL;
4023 /* Check the step (evolution) of the load in LOOP, and record
4024 whether it's invariant. */
4025 if (nested_in_vect_loop)
4026 step = STMT_VINFO_DR_STEP (stmt_info);
4027 else
4028 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4030 if (integer_zerop (step))
4031 *inv_p = true;
4032 else
4033 *inv_p = false;
4035 /* Create an expression for the first address accessed by this load
4036 in LOOP. */
4037 base_name = get_name (DR_BASE_ADDRESS (dr));
4039 if (dump_enabled_p ())
4041 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4042 dump_printf_loc (MSG_NOTE, vect_location,
4043 "create %s-pointer variable to type: ",
4044 get_tree_code_name (TREE_CODE (aggr_type)));
4045 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4046 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4047 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4048 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4049 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4050 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4051 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4052 else
4053 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4054 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4055 dump_printf (MSG_NOTE, "\n");
4058 /* (1) Create the new aggregate-pointer variable.
4059 Vector and array types inherit the alias set of their component
4060 type by default so we need to use a ref-all pointer if the data
4061 reference does not conflict with the created aggregated data
4062 reference because it is not addressable. */
4063 bool need_ref_all = false;
4064 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4065 get_alias_set (DR_REF (dr))))
4066 need_ref_all = true;
4067 /* Likewise for any of the data references in the stmt group. */
4068 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4070 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4073 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4074 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4075 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4076 get_alias_set (DR_REF (sdr))))
4078 need_ref_all = true;
4079 break;
4081 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4083 while (orig_stmt);
4085 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4086 need_ref_all);
4087 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4090 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4091 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4092 def-use update cycles for the pointer: one relative to the outer-loop
4093 (LOOP), which is what steps (3) and (4) below do. The other is relative
4094 to the inner-loop (which is the inner-most loop containing the dataref),
4095 and this is done by step (5) below.
4097 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4098 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4099 redundant. Steps (3),(4) create the following:
4101 vp0 = &base_addr;
4102 LOOP: vp1 = phi(vp0,vp2)
4105 vp2 = vp1 + step
4106 goto LOOP
4108 If there is an inner-loop nested in loop, then step (5) will also be
4109 applied, and an additional update in the inner-loop will be created:
4111 vp0 = &base_addr;
4112 LOOP: vp1 = phi(vp0,vp2)
4114 inner: vp3 = phi(vp1,vp4)
4115 vp4 = vp3 + inner_step
4116 if () goto inner
4118 vp2 = vp1 + step
4119 if () goto LOOP */
4121 /* (2) Calculate the initial address of the aggregate-pointer, and set
4122 the aggregate-pointer to point to it before the loop. */
4124 /* Create &(base[init_val+offset]) in the loop preheader. */
4126 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4127 offset, loop);
4128 if (new_stmt_list)
4130 if (pe)
4132 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4133 gcc_assert (!new_bb);
4135 else
4136 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4139 *initial_address = new_temp;
4141 /* Create: p = (aggr_type *) initial_base */
4142 if (TREE_CODE (new_temp) != SSA_NAME
4143 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4145 vec_stmt = gimple_build_assign (aggr_ptr,
4146 fold_convert (aggr_ptr_type, new_temp));
4147 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4148 /* Copy the points-to information if it exists. */
4149 if (DR_PTR_INFO (dr))
4150 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4151 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4152 if (pe)
4154 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4155 gcc_assert (!new_bb);
4157 else
4158 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4160 else
4161 aggr_ptr_init = new_temp;
4163 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4164 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4165 inner-loop nested in LOOP (during outer-loop vectorization). */
4167 /* No update in loop is required. */
4168 if (only_init && (!loop_vinfo || at_loop == loop))
4169 aptr = aggr_ptr_init;
4170 else
4172 /* The step of the aggregate pointer is the type size. */
4173 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4174 /* One exception to the above is when the scalar step of the load in
4175 LOOP is zero. In this case the step here is also zero. */
4176 if (*inv_p)
4177 iv_step = size_zero_node;
4178 else if (tree_int_cst_sgn (step) == -1)
4179 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4181 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4183 create_iv (aggr_ptr_init,
4184 fold_convert (aggr_ptr_type, iv_step),
4185 aggr_ptr, loop, &incr_gsi, insert_after,
4186 &indx_before_incr, &indx_after_incr);
4187 incr = gsi_stmt (incr_gsi);
4188 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4190 /* Copy the points-to information if it exists. */
4191 if (DR_PTR_INFO (dr))
4193 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4194 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4196 if (ptr_incr)
4197 *ptr_incr = incr;
4199 aptr = indx_before_incr;
4202 if (!nested_in_vect_loop || only_init)
4203 return aptr;
4206 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4207 nested in LOOP, if it exists. */
4209 gcc_assert (nested_in_vect_loop);
4210 if (!only_init)
4212 standard_iv_increment_position (containing_loop, &incr_gsi,
4213 &insert_after);
4214 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4215 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4216 &indx_after_incr);
4217 incr = gsi_stmt (incr_gsi);
4218 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4220 /* Copy the points-to information if it exists. */
4221 if (DR_PTR_INFO (dr))
4223 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4224 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4226 if (ptr_incr)
4227 *ptr_incr = incr;
4229 return indx_before_incr;
4231 else
4232 gcc_unreachable ();
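/* Illustrative only, not part of GCC: a plain-C model of the data-ref
   pointer that vect_create_data_ref_ptr sets up for a simple copy loop.
   The real function emits GIMPLE (an initial address in the preheader and
   an induction variable bumped by TYPE_SIZE_UNIT (aggr_type)); the code
   below only shows the resulting access pattern.  The typedef, the helper
   name and the assumption that A and B are suitably aligned and N is a
   multiple of four are all hypothetical.  */
typedef int example_v4si __attribute__ ((vector_size (16)));

static void
example_data_ref_ptr_walk (int *a, const int *b, int n)
{
  example_v4si *ap = (example_v4si *) a;        /* step (2): initial address */
  const example_v4si *bp = (const example_v4si *) b;
  for (int i = 0; i < n / 4; i++)
    {
      *ap = *bp;        /* vector load and store through the new pointers    */
      ap++;             /* steps (3)/(4): bump by the aggregate type's size  */
      bp++;
    }
}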
4236 /* Function bump_vector_ptr
4238 Increment a pointer (to a vector type) by vector-size. If requested,
4239 i.e. if PTR_INCR is given, then also connect the new increment stmt
4240 to the existing def-use update-chain of the pointer, by modifying
4241 the PTR_INCR as illustrated below:
4243 The pointer def-use update-chain before this function:
4244 DATAREF_PTR = phi (p_0, p_2)
4245 ....
4246 PTR_INCR: p_2 = DATAREF_PTR + step
4248 The pointer def-use update-chain after this function:
4249 DATAREF_PTR = phi (p_0, p_2)
4250 ....
4251 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4252 ....
4253 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4255 Input:
4256 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4257 in the loop.
4258 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4259 the loop. The increment amount across iterations is expected
4260 to be vector_size.
4261 BSI - location where the new update stmt is to be placed.
4262 STMT - the original scalar memory-access stmt that is being vectorized.
4263 BUMP - optional. The offset by which to bump the pointer. If not given,
4264 the offset is assumed to be vector_size.
4266 Output: Return NEW_DATAREF_PTR as illustrated above.
4270 tree
4271 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4272 gimple stmt, tree bump)
4274 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4275 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4276 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4277 tree update = TYPE_SIZE_UNIT (vectype);
4278 gimple incr_stmt;
4279 ssa_op_iter iter;
4280 use_operand_p use_p;
4281 tree new_dataref_ptr;
4283 if (bump)
4284 update = bump;
4286 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4287 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4288 dataref_ptr, update);
4289 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4291 /* Copy the points-to information if it exists. */
4292 if (DR_PTR_INFO (dr))
4294 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4295 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4298 if (!ptr_incr)
4299 return new_dataref_ptr;
4301 /* Update the vector-pointer's cross-iteration increment. */
4302 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4304 tree use = USE_FROM_PTR (use_p);
4306 if (use == dataref_ptr)
4307 SET_USE (use_p, new_dataref_ptr);
4308 else
4309 gcc_assert (tree_int_cst_compare (use, update) == 0);
4312 return new_dataref_ptr;
4316 /* Function vect_create_destination_var.
4318 Create a new temporary of type VECTYPE. */
4320 tree
4321 vect_create_destination_var (tree scalar_dest, tree vectype)
4323 tree vec_dest;
4324 const char *name;
4325 char *new_name;
4326 tree type;
4327 enum vect_var_kind kind;
4329 kind = vectype ? vect_simple_var : vect_scalar_var;
4330 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4332 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4334 name = get_name (scalar_dest);
4335 if (name)
4336 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4337 else
4338 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4339 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4340 free (new_name);
4342 return vec_dest;
4345 /* Function vect_grouped_store_supported.
4347 Returns TRUE if interleave high and interleave low permutations
4348 are supported, and FALSE otherwise. */
4350 bool
4351 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4353 enum machine_mode mode = TYPE_MODE (vectype);
4355 /* vect_permute_store_chain requires the group size to be a power of two. */
4356 if (exact_log2 (count) == -1)
4358 if (dump_enabled_p ())
4359 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4360 "the size of the group of accesses"
4361 " is not a power of 2\n");
4362 return false;
4365 /* Check that the permutation is supported. */
4366 if (VECTOR_MODE_P (mode))
4368 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4369 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4370 for (i = 0; i < nelt / 2; i++)
4372 sel[i * 2] = i;
4373 sel[i * 2 + 1] = i + nelt;
4375 if (can_vec_perm_p (mode, false, sel))
4377 for (i = 0; i < nelt; i++)
4378 sel[i] += nelt / 2;
4379 if (can_vec_perm_p (mode, false, sel))
4380 return true;
4384 if (dump_enabled_p ())
4385 dump_printf (MSG_MISSED_OPTIMIZATION,
4386 "interleave op not supported by target.\n");
4387 return false;
4391 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4392 type VECTYPE. */
4394 bool
4395 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4397 return vect_lanes_optab_supported_p ("vec_store_lanes",
4398 vec_store_lanes_optab,
4399 vectype, count);
4403 /* Function vect_permute_store_chain.
4405 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4406 a power of 2, generate interleave_high/low stmts to reorder the data
4407 correctly for the stores. Return the final references for stores in
4408 RESULT_CHAIN.
4410 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4411 The input is 4 vectors each containing 8 elements. We assign a number to
4412 each element; the input sequence is:
4414 1st vec: 0 1 2 3 4 5 6 7
4415 2nd vec: 8 9 10 11 12 13 14 15
4416 3rd vec: 16 17 18 19 20 21 22 23
4417 4th vec: 24 25 26 27 28 29 30 31
4419 The output sequence should be:
4421 1st vec: 0 8 16 24 1 9 17 25
4422 2nd vec: 2 10 18 26 3 11 19 27
4423 3rd vec: 4 12 20 28 5 13 21 29
4424 4th vec: 6 14 22 30 7 15 23 31
4426 i.e., we interleave the contents of the four vectors in their order.
4428 We use interleave_high/low instructions to create such output. The input of
4429 each interleave_high/low operation is two vectors:
4430 1st vec 2nd vec
4431 0 1 2 3 4 5 6 7
4432 the even elements of the result vector are obtained left-to-right from the
4433 high/low elements of the first vector. The odd elements of the result are
4434 obtained left-to-right from the high/low elements of the second vector.
4435 The output of interleave_high will be: 0 4 1 5
4436 and of interleave_low: 2 6 3 7
4439 The permutation is done in log LENGTH stages. In each stage interleave_high
4440 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4441 where the first argument is taken from the first half of DR_CHAIN and the
4442 second argument from its second half.
4443 In our example,
4445 I1: interleave_high (1st vec, 3rd vec)
4446 I2: interleave_low (1st vec, 3rd vec)
4447 I3: interleave_high (2nd vec, 4th vec)
4448 I4: interleave_low (2nd vec, 4th vec)
4450 The output for the first stage is:
4452 I1: 0 16 1 17 2 18 3 19
4453 I2: 4 20 5 21 6 22 7 23
4454 I3: 8 24 9 25 10 26 11 27
4455 I4: 12 28 13 29 14 30 15 31
4457 The output of the second stage, i.e. the final result is:
4459 I1: 0 8 16 24 1 9 17 25
4460 I2: 2 10 18 26 3 11 19 27
4461 I3: 4 12 20 28 5 13 21 29
4462 I4: 6 14 22 30 7 15 23 31. */
4464 void
4465 vect_permute_store_chain (vec<tree> dr_chain,
4466 unsigned int length,
4467 gimple stmt,
4468 gimple_stmt_iterator *gsi,
4469 vec<tree> *result_chain)
4471 tree vect1, vect2, high, low;
4472 gimple perm_stmt;
4473 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4474 tree perm_mask_low, perm_mask_high;
4475 unsigned int i, n;
4476 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4477 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4479 result_chain->quick_grow (length);
4480 memcpy (result_chain->address (), dr_chain.address (),
4481 length * sizeof (tree));
4483 for (i = 0, n = nelt / 2; i < n; i++)
4485 sel[i * 2] = i;
4486 sel[i * 2 + 1] = i + nelt;
4488 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4489 gcc_assert (perm_mask_high != NULL);
4491 for (i = 0; i < nelt; i++)
4492 sel[i] += nelt / 2;
4493 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4494 gcc_assert (perm_mask_low != NULL);
4496 for (i = 0, n = exact_log2 (length); i < n; i++)
4498 for (j = 0; j < length/2; j++)
4500 vect1 = dr_chain[j];
4501 vect2 = dr_chain[j+length/2];
4503 /* Create interleaving stmt:
4504 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4505 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4506 perm_stmt
4507 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4508 vect1, vect2, perm_mask_high);
4509 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4510 (*result_chain)[2*j] = high;
4512 /* Create interleaving stmt:
4513 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4514 nelt*3/2+1, ...}> */
4515 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4516 perm_stmt
4517 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4518 vect1, vect2, perm_mask_low);
4519 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4520 (*result_chain)[2*j+1] = low;
4522 memcpy (dr_chain.address (), result_chain->address (),
4523 length * sizeof (tree));
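/* Illustrative only, not part of GCC: the effect of the interleave_high /
   interleave_low masks built in vect_permute_store_chain above, modelled
   on plain arrays of NELT elements.  "high" interleaves the first halves
   of the two inputs ({0, nelt, 1, nelt+1, ...}) and "low" interleaves the
   second halves ({nelt/2, nelt*3/2, nelt/2+1, ...}).  The helper name is
   hypothetical.  */
static void
example_interleave_high_low (const int *v1, const int *v2, int nelt,
                             int *high, int *low)
{
  for (int i = 0; i < nelt / 2; i++)
    {
      high[2 * i]     = v1[i];               /* mask element 2i   = i        */
      high[2 * i + 1] = v2[i];               /* mask element 2i+1 = i + nelt */
      low[2 * i]      = v1[i + nelt / 2];    /* same mask shifted by nelt/2  */
      low[2 * i + 1]  = v2[i + nelt / 2];
    }
}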
4527 /* Function vect_setup_realignment
4529 This function is called when vectorizing an unaligned load using
4530 the dr_explicit_realign[_optimized] scheme.
4531 This function generates the following code at the loop prolog:
4533 p = initial_addr;
4534 x msq_init = *(floor(p)); # prolog load
4535 realignment_token = call target_builtin;
4536 loop:
4537 x msq = phi (msq_init, ---)
4539 The stmts marked with x are generated only for the case of
4540 dr_explicit_realign_optimized.
4542 The code above sets up a new (vector) pointer, pointing to the first
4543 location accessed by STMT, and a "floor-aligned" load using that pointer.
4544 It also generates code to compute the "realignment-token" (if the relevant
4545 target hook was defined), and creates a phi-node at the loop-header bb
4546 whose arguments are the result of the prolog-load (created by this
4547 function) and the result of a load that takes place in the loop (to be
4548 created by the caller to this function).
4550 For the case of dr_explicit_realign_optimized:
4551 The caller to this function uses the phi-result (msq) to create the
4552 realignment code inside the loop, and sets up the missing phi argument,
4553 as follows:
4554 loop:
4555 msq = phi (msq_init, lsq)
4556 lsq = *(floor(p')); # load in loop
4557 result = realign_load (msq, lsq, realignment_token);
4559 For the case of dr_explicit_realign:
4560 loop:
4561 msq = *(floor(p)); # load in loop
4562 p' = p + (VS-1);
4563 lsq = *(floor(p')); # load in loop
4564 result = realign_load (msq, lsq, realignment_token);
4566 Input:
4567 STMT - (scalar) load stmt to be vectorized. This load accesses
4568 a memory location that may be unaligned.
4569 BSI - place where new code is to be inserted.
4570 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4571 is used.
4573 Output:
4574 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4575 target hook, if defined.
4576 Return value - the result of the loop-header phi node. */
4578 tree
4579 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4580 tree *realignment_token,
4581 enum dr_alignment_support alignment_support_scheme,
4582 tree init_addr,
4583 struct loop **at_loop)
4585 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4586 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4587 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4588 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4589 struct loop *loop = NULL;
4590 edge pe = NULL;
4591 tree scalar_dest = gimple_assign_lhs (stmt);
4592 tree vec_dest;
4593 gimple inc;
4594 tree ptr;
4595 tree data_ref;
4596 gimple new_stmt;
4597 basic_block new_bb;
4598 tree msq_init = NULL_TREE;
4599 tree new_temp;
4600 gimple phi_stmt;
4601 tree msq = NULL_TREE;
4602 gimple_seq stmts = NULL;
4603 bool inv_p;
4604 bool compute_in_loop = false;
4605 bool nested_in_vect_loop = false;
4606 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4607 struct loop *loop_for_initial_load = NULL;
4609 if (loop_vinfo)
4611 loop = LOOP_VINFO_LOOP (loop_vinfo);
4612 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4615 gcc_assert (alignment_support_scheme == dr_explicit_realign
4616 || alignment_support_scheme == dr_explicit_realign_optimized);
4618 /* We need to generate three things:
4619 1. the misalignment computation
4620 2. the extra vector load (for the optimized realignment scheme).
4621 3. the phi node for the two vectors from which the realignment is
4622 done (for the optimized realignment scheme). */
4624 /* 1. Determine where to generate the misalignment computation.
4626 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4627 calculation will be generated by this function, outside the loop (in the
4628 preheader). Otherwise, INIT_ADDR has already been computed for us by the
4629 caller, inside the loop.
4631 Background: If the misalignment remains fixed throughout the iterations of
4632 the loop, then both realignment schemes are applicable, and also the
4633 misalignment computation can be done outside LOOP. This is because we are
4634 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4635 are a multiple of VS (the Vector Size), and therefore the misalignment in
4636 different vectorized LOOP iterations is always the same.
4637 The problem arises only if the memory access is in an inner-loop nested
4638 inside LOOP, which is now being vectorized using outer-loop vectorization.
4639 This is the only case when the misalignment of the memory access may not
4640 remain fixed throughout the iterations of the inner-loop (as explained in
4641 detail in vect_supportable_dr_alignment). In this case, not only is the
4642 optimized realignment scheme not applicable, but also the misalignment
4643 computation (and generation of the realignment token that is passed to
4644 REALIGN_LOAD) have to be done inside the loop.
4646 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4647 or not, which in turn determines if the misalignment is computed inside
4648 the inner-loop, or outside LOOP. */
4650 if (init_addr != NULL_TREE || !loop_vinfo)
4652 compute_in_loop = true;
4653 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4657 /* 2. Determine where to generate the extra vector load.
4659 For the optimized realignment scheme, instead of generating two vector
4660 loads in each iteration, we generate a single extra vector load in the
4661 preheader of the loop, and in each iteration reuse the result of the
4662 vector load from the previous iteration. In case the memory access is in
4663 an inner-loop nested inside LOOP, which is now being vectorized using
4664 outer-loop vectorization, we need to determine whether this initial vector
4665 load should be generated at the preheader of the inner-loop, or can be
4666 generated at the preheader of LOOP. If the memory access has no evolution
4667 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4668 to be generated inside LOOP (in the preheader of the inner-loop). */
4670 if (nested_in_vect_loop)
4672 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4673 bool invariant_in_outerloop =
4674 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4675 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4677 else
4678 loop_for_initial_load = loop;
4679 if (at_loop)
4680 *at_loop = loop_for_initial_load;
4682 if (loop_for_initial_load)
4683 pe = loop_preheader_edge (loop_for_initial_load);
4685 /* 3. For the case of the optimized realignment, create the first vector
4686 load at the loop preheader. */
4688 if (alignment_support_scheme == dr_explicit_realign_optimized)
4690 /* Create msq_init = *(floor(p1)) in the loop preheader */
4692 gcc_assert (!compute_in_loop);
4693 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4694 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4695 NULL_TREE, &init_addr, NULL, &inc,
4696 true, &inv_p);
4697 new_temp = copy_ssa_name (ptr, NULL);
4698 new_stmt = gimple_build_assign_with_ops
4699 (BIT_AND_EXPR, new_temp, ptr,
4700 build_int_cst (TREE_TYPE (ptr),
4701 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4702 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4703 gcc_assert (!new_bb);
4704 data_ref
4705 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4706 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4707 new_stmt = gimple_build_assign (vec_dest, data_ref);
4708 new_temp = make_ssa_name (vec_dest, new_stmt);
4709 gimple_assign_set_lhs (new_stmt, new_temp);
4710 if (pe)
4712 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4713 gcc_assert (!new_bb);
4715 else
4716 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4718 msq_init = gimple_assign_lhs (new_stmt);
4721 /* 4. Create realignment token using a target builtin, if available.
4722 It is done either inside the containing loop, or before LOOP (as
4723 determined above). */
4725 if (targetm.vectorize.builtin_mask_for_load)
4727 tree builtin_decl;
4729 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4730 if (!init_addr)
4732 /* Generate the INIT_ADDR computation outside LOOP. */
4733 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4734 NULL_TREE, loop);
4735 if (loop)
4737 pe = loop_preheader_edge (loop);
4738 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4739 gcc_assert (!new_bb);
4741 else
4742 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4745 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4746 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4747 vec_dest =
4748 vect_create_destination_var (scalar_dest,
4749 gimple_call_return_type (new_stmt));
4750 new_temp = make_ssa_name (vec_dest, new_stmt);
4751 gimple_call_set_lhs (new_stmt, new_temp);
4753 if (compute_in_loop)
4754 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4755 else
4757 /* Generate the misalignment computation outside LOOP. */
4758 pe = loop_preheader_edge (loop);
4759 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4760 gcc_assert (!new_bb);
4763 *realignment_token = gimple_call_lhs (new_stmt);
4765 /* The result of the CALL_EXPR to this builtin is determined from
4766 the value of the parameter and no global variables are touched
4767 which makes the builtin a "const" function. Requiring the
4768 builtin to have the "const" attribute makes it unnecessary
4769 to call mark_call_clobbered. */
4770 gcc_assert (TREE_READONLY (builtin_decl));
4773 if (alignment_support_scheme == dr_explicit_realign)
4774 return msq;
4776 gcc_assert (!compute_in_loop);
4777 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4780 /* 5. Create msq = phi <msq_init, lsq> in loop */
4782 pe = loop_preheader_edge (containing_loop);
4783 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4784 msq = make_ssa_name (vec_dest, NULL);
4785 phi_stmt = create_phi_node (msq, containing_loop->header);
4786 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4788 return msq;
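/* Illustrative only, not part of GCC: a scalar, byte-level model of the
   explicit realignment scheme set up above, assuming 16-byte vectors and a
   flat address space (so that a pointer converts to size_t in the usual
   way).  floor(p) rounds P down to a vector boundary; MSQ and LSQ are the
   two aligned loads, and the final pair of memcpy calls plays the role of
   REALIGN_LOAD <msq, lsq, realignment_token>, shifting by the
   misalignment.  The struct and function names are hypothetical.  */
struct example_vec16 { unsigned char b[16]; };

static struct example_vec16
example_realign_load (const unsigned char *p)
{
  size_t mis = (size_t) p & 15;                 /* misalignment in bytes     */
  const unsigned char *fp = p - mis;            /* floor (p)                 */
  const unsigned char *fq                       /* floor (p + VS - 1)        */
    = (const unsigned char *) ((size_t) (p + 15) & ~(size_t) 15);
  unsigned char msq[16], lsq[16];
  struct example_vec16 result;

  memcpy (msq, fp, 16);                         /* msq = *(floor (p))        */
  memcpy (lsq, fq, 16);                         /* lsq = *(floor (p'))       */
  memcpy (result.b, msq + mis, 16 - mis);       /* tail of msq ...           */
  memcpy (result.b + (16 - mis), lsq, mis);     /* ... then head of lsq      */
  return result;
}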
4792 /* Function vect_grouped_load_supported.
4794 Returns TRUE if even and odd permutations are supported,
4795 and FALSE otherwise. */
4797 bool
4798 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4800 enum machine_mode mode = TYPE_MODE (vectype);
4802 /* vect_permute_load_chain requires the group size to be a power of two. */
4803 if (exact_log2 (count) == -1)
4805 if (dump_enabled_p ())
4806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4807 "the size of the group of accesses"
4808 " is not a power of 2\n");
4809 return false;
4812 /* Check that the permutation is supported. */
4813 if (VECTOR_MODE_P (mode))
4815 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4816 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4818 for (i = 0; i < nelt; i++)
4819 sel[i] = i * 2;
4820 if (can_vec_perm_p (mode, false, sel))
4822 for (i = 0; i < nelt; i++)
4823 sel[i] = i * 2 + 1;
4824 if (can_vec_perm_p (mode, false, sel))
4825 return true;
4829 if (dump_enabled_p ())
4830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4831 "extract even/odd not supported by target\n");
4832 return false;
4835 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4836 type VECTYPE. */
4838 bool
4839 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4841 return vect_lanes_optab_supported_p ("vec_load_lanes",
4842 vec_load_lanes_optab,
4843 vectype, count);
4846 /* Function vect_permute_load_chain.
4848 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4849 a power of 2, generate extract_even/odd stmts to reorder the input data
4850 correctly. Return the final references for loads in RESULT_CHAIN.
4852 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4853 The input is 4 vectors each containing 8 elements. We assign a number to each
4854 element; the input sequence is:
4856 1st vec: 0 1 2 3 4 5 6 7
4857 2nd vec: 8 9 10 11 12 13 14 15
4858 3rd vec: 16 17 18 19 20 21 22 23
4859 4th vec: 24 25 26 27 28 29 30 31
4861 The output sequence should be:
4863 1st vec: 0 4 8 12 16 20 24 28
4864 2nd vec: 1 5 9 13 17 21 25 29
4865 3rd vec: 2 6 10 14 18 22 26 30
4866 4th vec: 3 7 11 15 19 23 27 31
4868 i.e., the first output vector should contain the first elements of each
4869 interleaving group, etc.
4871 We use extract_even/odd instructions to create such output. The input of
4872 each extract_even/odd operation is two vectors
4873 1st vec 2nd vec
4874 0 1 2 3 4 5 6 7
4876 and the output is the vector of extracted even/odd elements. The output of
4877 extract_even will be: 0 2 4 6
4878 and of extract_odd: 1 3 5 7
4881 The permutation is done in log LENGTH stages. In each stage extract_even
4882 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4883 their order. In our example,
4885 E1: extract_even (1st vec, 2nd vec)
4886 E2: extract_odd (1st vec, 2nd vec)
4887 E3: extract_even (3rd vec, 4th vec)
4888 E4: extract_odd (3rd vec, 4th vec)
4890 The output for the first stage will be:
4892 E1: 0 2 4 6 8 10 12 14
4893 E2: 1 3 5 7 9 11 13 15
4894 E3: 16 18 20 22 24 26 28 30
4895 E4: 17 19 21 23 25 27 29 31
4897 In order to proceed and create the correct sequence for the next stage (or
4898 for the correct output, if the second stage is the last one, as in our
4899 example), we first put the output of extract_even operation and then the
4900 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4901 The input for the second stage is:
4903 1st vec (E1): 0 2 4 6 8 10 12 14
4904 2nd vec (E3): 16 18 20 22 24 26 28 30
4905 3rd vec (E2): 1 3 5 7 9 11 13 15
4906 4th vec (E4): 17 19 21 23 25 27 29 31
4908 The output of the second stage:
4910 E1: 0 4 8 12 16 20 24 28
4911 E2: 2 6 10 14 18 22 26 30
4912 E3: 1 5 9 13 17 21 25 29
4913 E4: 3 7 11 15 19 23 27 31
4915 And RESULT_CHAIN after reordering:
4917 1st vec (E1): 0 4 8 12 16 20 24 28
4918 2nd vec (E3): 1 5 9 13 17 21 25 29
4919 3rd vec (E2): 2 6 10 14 18 22 26 30
4920 4th vec (E4): 3 7 11 15 19 23 27 31. */
4922 static void
4923 vect_permute_load_chain (vec<tree> dr_chain,
4924 unsigned int length,
4925 gimple stmt,
4926 gimple_stmt_iterator *gsi,
4927 vec<tree> *result_chain)
4929 tree data_ref, first_vect, second_vect;
4930 tree perm_mask_even, perm_mask_odd;
4931 gimple perm_stmt;
4932 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4933 unsigned int i, j, log_length = exact_log2 (length);
4934 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4935 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4937 result_chain->quick_grow (length);
4938 memcpy (result_chain->address (), dr_chain.address (),
4939 length * sizeof (tree));
4941 for (i = 0; i < nelt; ++i)
4942 sel[i] = i * 2;
4943 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4944 gcc_assert (perm_mask_even != NULL);
4946 for (i = 0; i < nelt; ++i)
4947 sel[i] = i * 2 + 1;
4948 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4949 gcc_assert (perm_mask_odd != NULL);
4951 for (i = 0; i < log_length; i++)
4953 for (j = 0; j < length; j += 2)
4955 first_vect = dr_chain[j];
4956 second_vect = dr_chain[j+1];
4958 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4959 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4960 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4961 first_vect, second_vect,
4962 perm_mask_even);
4963 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4964 (*result_chain)[j/2] = data_ref;
4966 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4967 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4968 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4969 first_vect, second_vect,
4970 perm_mask_odd);
4971 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4972 (*result_chain)[j/2+length/2] = data_ref;
4974 memcpy (dr_chain.address (), result_chain->address (),
4975 length * sizeof (tree));
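/* Illustrative only, not part of GCC: the effect of the extract_even /
   extract_odd masks built in vect_permute_load_chain above, modelled on
   plain arrays.  Viewing V1 followed by V2 as one sequence of 2*NELT
   elements, extract_even keeps positions 0, 2, 4, ... and extract_odd
   keeps positions 1, 3, 5, ..., matching the {i*2} and {i*2+1} masks.
   The helper name is hypothetical.  */
static void
example_extract_even_odd (const int *v1, const int *v2, int nelt,
                          int *even, int *odd)
{
  for (int i = 0; i < nelt; i++)
    {
      int e = 2 * i;                            /* index into concat (v1, v2) */
      even[i] = e < nelt ? v1[e] : v2[e - nelt];
      odd[i] = e + 1 < nelt ? v1[e + 1] : v2[e + 1 - nelt];
    }
}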
4980 /* Function vect_transform_grouped_load.
4982 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4983 to perform their permutation and ascribe the resulting vectorized statements to
4984 the scalar statements.
4987 void
4988 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4989 gimple_stmt_iterator *gsi)
4991 vec<tree> result_chain = vNULL;
4993 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4994 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4995 vectors, that are ready for vector computation. */
4996 result_chain.create (size);
4997 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4998 vect_record_grouped_load_vectors (stmt, result_chain);
4999 result_chain.release ();
5002 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5003 generated as part of the vectorization of STMT. Assign the statement
5004 for each vector to the associated scalar statement. */
5006 void
5007 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5009 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5010 gimple next_stmt, new_stmt;
5011 unsigned int i, gap_count;
5012 tree tmp_data_ref;
5014 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5015 Since we scan the chain starting from its first node, their order
5016 corresponds to the order of data-refs in RESULT_CHAIN. */
5017 next_stmt = first_stmt;
5018 gap_count = 1;
5019 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5021 if (!next_stmt)
5022 break;
5024 /* Skip the gaps. Loads created for the gaps will be removed by the
5025 dead code elimination pass later. No need to check for the first stmt
5026 in the group, since it always exists.
5027 GROUP_GAP is the number of steps in elements from the previous
5028 access (if there is no gap, GROUP_GAP is 1). We skip loads that
5029 correspond to the gaps. */
5030 if (next_stmt != first_stmt
5031 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5033 gap_count++;
5034 continue;
5037 while (next_stmt)
5039 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5040 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5041 copies, and we put the new vector statement in the first available
5042 RELATED_STMT. */
5043 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5044 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5045 else
5047 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5049 gimple prev_stmt =
5050 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5051 gimple rel_stmt =
5052 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5053 while (rel_stmt)
5055 prev_stmt = rel_stmt;
5056 rel_stmt =
5057 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5060 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5061 new_stmt;
5065 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5066 gap_count = 1;
5067 /* If NEXT_STMT accesses the same DR as the previous statement,
5068 put the same TMP_DATA_REF as its vectorized statement; otherwise
5069 get the next data-ref from RESULT_CHAIN. */
5070 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5071 break;
5076 /* Function vect_can_force_dr_alignment_p.
5078 Returns whether the alignment of DECL can be forced to an ALIGNMENT-bit
5079 boundary. */
5081 bool
5082 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5084 if (TREE_CODE (decl) != VAR_DECL)
5085 return false;
5087 /* We cannot change alignment of common or external symbols as another
5088 translation unit may contain a definition with lower alignment.
5089 The rules of common symbol linking mean that the definition
5090 will override the common symbol. The same is true for constant
5091 pool entries which may be shared and are not properly merged
5092 by LTO. */
5093 if (DECL_EXTERNAL (decl)
5094 || DECL_COMMON (decl)
5095 || DECL_IN_CONSTANT_POOL (decl))
5096 return false;
5098 if (TREE_ASM_WRITTEN (decl))
5099 return false;
5101 /* Do not override the alignment as specified by the ABI when the used
5102 attribute is set. */
5103 if (DECL_PRESERVE_P (decl))
5104 return false;
5106 /* Do not override explicit alignment set by the user when an explicit
5107 section name is also used. This is a common idiom used by many
5108 software projects. */
5109 if (DECL_SECTION_NAME (decl) != NULL_TREE
5110 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
5111 return false;
5113 if (TREE_STATIC (decl))
5114 return (alignment <= MAX_OFILE_ALIGNMENT);
5115 else
5116 return (alignment <= MAX_STACK_ALIGNMENT);
5120 /* Return whether the data reference DR is supported with respect to its
5121 alignment.
5122 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5123 if it is aligned, i.e., check if it is possible to vectorize it with
5124 different alignment. */
5126 enum dr_alignment_support
5127 vect_supportable_dr_alignment (struct data_reference *dr,
5128 bool check_aligned_accesses)
5130 gimple stmt = DR_STMT (dr);
5131 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5132 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5133 enum machine_mode mode = TYPE_MODE (vectype);
5134 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5135 struct loop *vect_loop = NULL;
5136 bool nested_in_vect_loop = false;
5138 if (aligned_access_p (dr) && !check_aligned_accesses)
5139 return dr_aligned;
5141 /* For now assume all conditional loads/stores support unaligned
5142 access without any special code. */
5143 if (is_gimple_call (stmt)
5144 && gimple_call_internal_p (stmt)
5145 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5146 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5147 return dr_unaligned_supported;
5149 if (loop_vinfo)
5151 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5152 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5155 /* Possibly unaligned access. */
5157 /* We can choose between using the implicit realignment scheme (generating
5158 a misaligned_move stmt) and the explicit realignment scheme (generating
5159 aligned loads with a REALIGN_LOAD). There are two variants to the
5160 explicit realignment scheme: optimized, and unoptimized.
5161 We can optimize the realignment only if the step between consecutive
5162 vector loads is equal to the vector size. Since the vector memory
5163 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5164 is guaranteed that the misalignment amount remains the same throughout the
5165 execution of the vectorized loop. Therefore, we can create the
5166 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5167 at the loop preheader.
5169 However, in the case of outer-loop vectorization, when vectorizing a
5170 memory access in the inner-loop nested within the LOOP that is now being
5171 vectorized, while it is guaranteed that the misalignment of the
5172 vectorized memory access will remain the same in different outer-loop
5173 iterations, it is *not* guaranteed that it will remain the same throughout
5174 the execution of the inner-loop. This is because the inner-loop advances
5175 with the original scalar step (and not in steps of VS). If the inner-loop
5176 step happens to be a multiple of VS, then the misalignment remains fixed
5177 and we can use the optimized realignment scheme. For example:
5179 for (i=0; i<N; i++)
5180 for (j=0; j<M; j++)
5181 s += a[i+j];
5183 When vectorizing the i-loop in the above example, the step between
5184 consecutive vector loads is 1, and so the misalignment does not remain
5185 fixed across the execution of the inner-loop, and the realignment cannot
5186 be optimized (as illustrated in the following pseudo vectorized loop):
5188 for (i=0; i<N; i+=4)
5189 for (j=0; j<M; j++){
5190 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5191 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5192 // (assuming that we start from an aligned address).
5195 We therefore have to use the unoptimized realignment scheme:
5197 for (i=0; i<N; i+=4)
5198 for (j=k; j<M; j+=4)
5199 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5200 // that the misalignment of the initial address is
5201 // 0).
5203 The loop can then be vectorized as follows:
5205 for (k=0; k<4; k++){
5206 rt = get_realignment_token (&vp[k]);
5207 for (i=0; i<N; i+=4){
5208 v1 = vp[i+k];
5209 for (j=k; j<M; j+=4){
5210 v2 = vp[i+j+VS-1];
5211 va = REALIGN_LOAD <v1,v2,rt>;
5212 vs += va;
5213 v1 = v2;
5216 } */
5218 if (DR_IS_READ (dr))
5220 bool is_packed = false;
5221 tree type = (TREE_TYPE (DR_REF (dr)));
5223 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5224 && (!targetm.vectorize.builtin_mask_for_load
5225 || targetm.vectorize.builtin_mask_for_load ()))
5227 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5228 if ((nested_in_vect_loop
5229 && (TREE_INT_CST_LOW (DR_STEP (dr))
5230 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5231 || !loop_vinfo)
5232 return dr_explicit_realign;
5233 else
5234 return dr_explicit_realign_optimized;
5236 if (!known_alignment_for_access_p (dr))
5237 is_packed = not_size_aligned (DR_REF (dr));
5239 if ((TYPE_USER_ALIGN (type) && !is_packed)
5240 || targetm.vectorize.
5241 support_vector_misalignment (mode, type,
5242 DR_MISALIGNMENT (dr), is_packed))
5243 /* Can't software pipeline the loads, but can at least do them. */
5244 return dr_unaligned_supported;
5246 else
5248 bool is_packed = false;
5249 tree type = (TREE_TYPE (DR_REF (dr)));
5251 if (!known_alignment_for_access_p (dr))
5252 is_packed = not_size_aligned (DR_REF (dr));
5254 if ((TYPE_USER_ALIGN (type) && !is_packed)
5255 || targetm.vectorize.
5256 support_vector_misalignment (mode, type,
5257 DR_MISALIGNMENT (dr), is_packed))
5258 return dr_unaligned_supported;
5261 /* Unsupported. */
5262 return dr_unaligned_unsupported;
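/* Illustrative only, not part of GCC: the arithmetic fact behind the
   comment above.  Stepping an address by a multiple of the vector size VS
   leaves its misalignment (the remainder modulo VS) unchanged, which is
   why the realignment token can be computed once in the preheader when
   the vector loads advance in steps of VS.  The helper name is
   hypothetical.  */
static unsigned int
example_misalignment_after_steps (unsigned HOST_WIDE_INT addr,
                                  unsigned int vs, unsigned int k)
{
  /* (addr + k * vs) % vs == addr % vs for any k.  */
  return (unsigned int) ((addr + (unsigned HOST_WIDE_INT) k * vs) % vs);
}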