[official-gcc.git] / gcc / tree-vect-data-refs.c
blob: 49303b1b58aa100bab1347ccdf65fedd4ec94d9f
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2014 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "stor-layout.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-ssa-alias.h"
34 #include "internal-fn.h"
35 #include "tree-eh.h"
36 #include "gimple-expr.h"
37 #include "is-a.h"
38 #include "gimple.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "gimple-ssa.h"
43 #include "tree-phinodes.h"
44 #include "ssa-iterators.h"
45 #include "stringpool.h"
46 #include "tree-ssanames.h"
47 #include "tree-ssa-loop-ivopts.h"
48 #include "tree-ssa-loop-manip.h"
49 #include "tree-ssa-loop.h"
51 #include "cfgloop.h"
52 #include "tree-chrec.h"
53 #include "tree-scalar-evolution.h"
54 #include "tree-vectorizer.h"
55 #include "diagnostic-core.h"
56 #include "cgraph.h"
57 /* Need to include rtl.h, expr.h, etc. for optabs. */
58 #include "expr.h"
59 #include "optabs.h"
61 /* Return true if load- or store-lanes optab OPTAB is implemented for
62 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
64 static bool
65 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
66 tree vectype, unsigned HOST_WIDE_INT count)
68 enum machine_mode mode, array_mode;
69 bool limit_p;
71 mode = TYPE_MODE (vectype);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
74 MODE_INT, limit_p);
76 if (array_mode == BLKmode)
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
80 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
81 GET_MODE_NAME (mode), count);
82 return false;
85 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
89 "cannot use %s<%s><%s>\n", name,
90 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
91 return false;
94 if (dump_enabled_p ())
95 dump_printf_loc (MSG_NOTE, vect_location,
96 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
97 GET_MODE_NAME (mode));
99 return true;
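/* A worked example of the check above (illustrative; the exact modes are
   target dependent): with VECTYPE V4SI (a 128-bit vector) and COUNT == 2
   we look for a 256-bit integer mode to hold the lane array.  On a target
   such as AArch64, which provides OImode and implements
   TARGET_ARRAY_MODE_SUPPORTED_P for these counts, the test reduces to
   whether the vec_load_lanes/vec_store_lanes optab has a handler for
   (OImode, V4SImode); if no suitable wide integer mode exists the lookup
   yields BLKmode and we fail early.  */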
103 /* Return the smallest scalar part of STMT.
104 This is used to determine the vectype of the stmt. We generally set the
105 vectype according to the type of the result (lhs). For stmts whose
106 result-type is different than the type of the arguments (e.g., demotion,
107 promotion), vectype will be reset appropriately (later). Note that we have
108 to visit the smallest datatype in this function, because that determines the
109 VF. If the smallest datatype in the loop is present only as the rhs of a
110 promotion operation - we'd miss it.
111 Such a case, where a variable of this datatype does not appear in the lhs
112 anywhere in the loop, can only occur if it's an invariant: e.g.:
113 'int_x = (int) short_inv', which we'd expect to have been optimized away by
114 invariant motion. However, we cannot rely on invariant motion to always
115 take invariants out of the loop, and so in the case of promotion we also
116 have to check the rhs.
117 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
118 types. */
120 tree
121 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
122 HOST_WIDE_INT *rhs_size_unit)
124 tree scalar_type = gimple_expr_type (stmt);
125 HOST_WIDE_INT lhs, rhs;
127 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129 if (is_gimple_assign (stmt)
130 && (gimple_assign_cast_p (stmt)
131 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
132 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
133 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
135 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
137 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
138 if (rhs < lhs)
139 scalar_type = rhs_type;
142 *lhs_size_unit = lhs;
143 *rhs_size_unit = rhs;
144 return scalar_type;
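/* For instance, given the (hypothetical) promotion statement

     int_y = (int) short_x;

   the lhs is 4 bytes and the rhs 2 bytes, so the short type is returned.
   The vectorization factor is then derived from the 2-byte element,
   e.g. 8 for 128-bit vectors, rather than 4 from the int result.  */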
148 /* Insert DDR into the LOOP_VINFO list of ddrs that may alias and need to be
149 tested at run-time. Return TRUE if DDR was successfully inserted.
150 Return FALSE if versioning is not supported. */
152 static bool
153 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
155 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
157 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
158 return false;
160 if (dump_enabled_p ())
162 dump_printf_loc (MSG_NOTE, vect_location,
163 "mark for run-time aliasing test between ");
164 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
165 dump_printf (MSG_NOTE, " and ");
166 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
167 dump_printf (MSG_NOTE, "\n");
170 if (optimize_loop_nest_for_size_p (loop))
172 if (dump_enabled_p ())
173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
174 "versioning not supported when optimizing"
175 " for size.\n");
176 return false;
179 /* FORNOW: We don't support versioning with outer-loop vectorization. */
180 if (loop->inner)
182 if (dump_enabled_p ())
183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
184 "versioning not yet supported for outer-loops.\n");
185 return false;
188 /* FORNOW: We don't support creating runtime alias tests for non-constant
189 step. */
190 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
191 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
193 if (dump_enabled_p ())
194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
195 "versioning not yet supported for non-constant "
196 "step\n");
197 return false;
200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
201 return true;
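/* The recorded DDRs are later expanded into an alias check that guards the
   vectorized version of the loop.  Conceptually the generated condition is
   a segment-overlap test roughly of the form

     if (addr_a + segment_length_a <= addr_b
         || addr_b + segment_length_b <= addr_a)
       ... vectorized loop ...
     else
       ... scalar loop ...

   where the segment lengths are derived from DR_STEP and the iteration
   count; this is only a sketch of the actual condition.  */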
205 /* Function vect_analyze_data_ref_dependence.
207 Return TRUE if there (might) exist a dependence between a memory-reference
208 DRA and a memory-reference DRB. When versioning for alias can check the
209 dependence at run-time, return FALSE. Adjust *MAX_VF according to
210 the data dependence.
212 static bool
213 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
214 loop_vec_info loop_vinfo, int *max_vf)
216 unsigned int i;
217 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
218 struct data_reference *dra = DDR_A (ddr);
219 struct data_reference *drb = DDR_B (ddr);
220 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
221 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
222 lambda_vector dist_v;
223 unsigned int loop_depth;
225 /* In loop analysis all data references should be vectorizable. */
226 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
227 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
228 gcc_unreachable ();
230 /* Independent data accesses. */
231 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
232 return false;
234 if (dra == drb
235 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
236 return false;
238 /* Even if we have an anti-dependence then, as the vectorized loop covers at
239 least two scalar iterations, there is always also a true dependence.
240 As the vectorizer does not re-order loads and stores we can ignore
241 the anti-dependence if TBAA can disambiguate both DRs similar to the
242 case with known negative distance anti-dependences (positive
243 distance anti-dependences would violate TBAA constraints). */
244 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
245 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
246 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
247 get_alias_set (DR_REF (drb))))
248 return false;
250 /* Unknown data dependence. */
251 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
253 /* If user asserted safelen consecutive iterations can be
254 executed concurrently, assume independence. */
255 if (loop->safelen >= 2)
257 if (loop->safelen < *max_vf)
258 *max_vf = loop->safelen;
259 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
260 return false;
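/* loop->safelen typically comes from a user assertion such as
   "#pragma omp simd safelen(8)" or "#pragma GCC ivdep"; e.g. with
   safelen(8) and a previous *max_vf of 16, *max_vf is capped to 8 here
   and the unknown dependence is assumed not to matter.  */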
263 if (STMT_VINFO_GATHER_P (stmtinfo_a)
264 || STMT_VINFO_GATHER_P (stmtinfo_b))
266 if (dump_enabled_p ())
268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
269 "versioning for alias not supported for: "
270 "can't determine dependence between ");
271 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
272 DR_REF (dra));
273 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
274 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
275 DR_REF (drb));
276 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
278 return true;
281 if (dump_enabled_p ())
283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
284 "versioning for alias required: "
285 "can't determine dependence between ");
286 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
287 DR_REF (dra));
288 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
289 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
290 DR_REF (drb));
291 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
294 /* Add to list of ddrs that need to be tested at run-time. */
295 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
298 /* Known data dependence. */
299 if (DDR_NUM_DIST_VECTS (ddr) == 0)
301 /* If user asserted safelen consecutive iterations can be
302 executed concurrently, assume independence. */
303 if (loop->safelen >= 2)
305 if (loop->safelen < *max_vf)
306 *max_vf = loop->safelen;
307 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
308 return false;
311 if (STMT_VINFO_GATHER_P (stmtinfo_a)
312 || STMT_VINFO_GATHER_P (stmtinfo_b))
314 if (dump_enabled_p ())
316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
317 "versioning for alias not supported for: "
318 "bad dist vector for ");
319 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
320 DR_REF (dra));
321 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
323 DR_REF (drb));
324 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
326 return true;
329 if (dump_enabled_p ())
331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
332 "versioning for alias required: "
333 "bad dist vector for ");
334 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
335 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
336 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
337 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
339 /* Add to list of ddrs that need to be tested at run-time. */
340 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
343 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
344 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
346 int dist = dist_v[loop_depth];
348 if (dump_enabled_p ())
349 dump_printf_loc (MSG_NOTE, vect_location,
350 "dependence distance = %d.\n", dist);
352 if (dist == 0)
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_NOTE, vect_location,
357 "dependence distance == 0 between ");
358 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
359 dump_printf (MSG_NOTE, " and ");
360 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
361 dump_printf (MSG_NOTE, "\n");
364 /* When we perform grouped accesses and perform implicit CSE
365 by detecting equal accesses and doing disambiguation with
366 runtime alias tests like for
367 .. = a[i];
368 .. = a[i+1];
369 a[i] = ..;
370 a[i+1] = ..;
371 *p = ..;
372 .. = a[i];
373 .. = a[i+1];
374 where we will end up loading { a[i], a[i+1] } once, make
375 sure that inserting group loads before the first load and
376 stores after the last store will do the right thing.
377 Similar for groups like
378 a[i] = ...;
379 ... = a[i];
380 a[i+1] = ...;
381 where loads from the group interleave with the store. */
382 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
383 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
385 gimple earlier_stmt;
386 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
387 if (DR_IS_WRITE
388 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
390 if (dump_enabled_p ())
391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
392 "READ_WRITE dependence in interleaving."
393 "\n");
394 return true;
398 continue;
401 if (dist > 0 && DDR_REVERSED_P (ddr))
403 /* If DDR_REVERSED_P the order of the data-refs in DDR was
404 reversed (to make distance vector positive), and the actual
405 distance is negative. */
406 if (dump_enabled_p ())
407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
408 "dependence distance negative.\n");
409 /* Record a negative dependence distance to later limit the
410 amount of stmt copying / unrolling we can perform.
411 Only need to handle read-after-write dependence. */
412 if (DR_IS_READ (drb)
413 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
414 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
415 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
416 continue;
419 if (abs (dist) >= 2
420 && abs (dist) < *max_vf)
422 /* The dependence distance requires reduction of the maximal
423 vectorization factor. */
424 *max_vf = abs (dist);
425 if (dump_enabled_p ())
426 dump_printf_loc (MSG_NOTE, vect_location,
427 "adjusting maximal vectorization factor to %i\n",
428 *max_vf);
431 if (abs (dist) >= *max_vf)
433 /* Dependence distance does not create dependence, as far as
434 vectorization is concerned, in this case. */
435 if (dump_enabled_p ())
436 dump_printf_loc (MSG_NOTE, vect_location,
437 "dependence distance >= VF.\n");
438 continue;
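/* For example, with *max_vf == 4 a distance of 4 between a store and a
   later load means the load in iteration i reads a value stored 4 scalar
   iterations earlier, i.e. in a previous vector iteration, so executing
   4 iterations at once still respects the dependence.  */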
441 if (dump_enabled_p ())
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
444 "not vectorized, possible dependence "
445 "between data-refs ");
446 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
447 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
448 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
449 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
452 return true;
455 return false;
458 /* Function vect_analyze_data_ref_dependences.
460 Examine all the data references in the loop, and make sure there do not
461 exist any data dependences between them. Set *MAX_VF according to
462 the maximum vectorization factor the data dependences allow. */
464 bool
465 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
467 unsigned int i;
468 struct data_dependence_relation *ddr;
470 if (dump_enabled_p ())
471 dump_printf_loc (MSG_NOTE, vect_location,
472 "=== vect_analyze_data_ref_dependences ===\n");
474 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
475 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
476 &LOOP_VINFO_DDRS (loop_vinfo),
477 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
478 return false;
480 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
481 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
482 return false;
484 return true;
488 /* Function vect_slp_analyze_data_ref_dependence.
490 Return TRUE if there (might) exist a dependence between a memory-reference
491 DRA and a memory-reference DRB that prevents basic-block (SLP)
492 vectorization. Return FALSE if the accesses are independent or the
493 dependence is harmless for SLP.
495 static bool
496 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
498 struct data_reference *dra = DDR_A (ddr);
499 struct data_reference *drb = DDR_B (ddr);
501 /* We need to check dependences of statements marked as unvectorizable
502 as well; they can still prohibit vectorization. */
504 /* Independent data accesses. */
505 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
506 return false;
508 if (dra == drb)
509 return false;
511 /* Read-read is OK. */
512 if (DR_IS_READ (dra) && DR_IS_READ (drb))
513 return false;
515 /* If dra and drb are part of the same interleaving chain consider
516 them independent. */
517 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
518 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
519 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
520 return false;
522 /* Unknown data dependence. */
523 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
525 if (dump_enabled_p ())
527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
528 "can't determine dependence between ");
529 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
530 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
531 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
532 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
535 else if (dump_enabled_p ())
537 dump_printf_loc (MSG_NOTE, vect_location,
538 "determined dependence between ");
539 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
540 dump_printf (MSG_NOTE, " and ");
541 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
542 dump_printf (MSG_NOTE, "\n");
545 /* We do not vectorize basic blocks with write-write dependencies. */
546 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
547 return true;
549 /* If we have a read-write dependence, check that the load is before the store.
550 When we vectorize basic blocks, a vector load can only be placed before the
551 corresponding scalar load, and a vector store can only be placed after its
552 corresponding scalar store. So the order of the accesses is preserved in
553 case the load is before the store. */
554 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
555 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
557 /* That only holds for load-store pairs taking part in vectorization. */
558 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
559 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
560 return false;
563 return true;
567 /* Function vect_slp_analyze_data_ref_dependences.
569 Examine all the data references in the basic-block, and make sure there
570 do not exist any data dependences between them that would prevent
571 basic-block (SLP) vectorization.
573 bool
574 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
576 struct data_dependence_relation *ddr;
577 unsigned int i;
579 if (dump_enabled_p ())
580 dump_printf_loc (MSG_NOTE, vect_location,
581 "=== vect_slp_analyze_data_ref_dependences ===\n");
583 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
584 &BB_VINFO_DDRS (bb_vinfo),
585 vNULL, true))
586 return false;
588 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
589 if (vect_slp_analyze_data_ref_dependence (ddr))
590 return false;
592 return true;
596 /* Function vect_compute_data_ref_alignment
598 Compute the misalignment of the data reference DR.
600 Output:
601 1. If during the misalignment computation it is found that the data reference
602 cannot be vectorized then false is returned.
603 2. DR_MISALIGNMENT (DR) is defined.
605 FOR NOW: No analysis is actually performed. Misalignment is calculated
606 only for trivial cases. TODO. */
608 static bool
609 vect_compute_data_ref_alignment (struct data_reference *dr)
611 gimple stmt = DR_STMT (dr);
612 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
613 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
614 struct loop *loop = NULL;
615 tree ref = DR_REF (dr);
616 tree vectype;
617 tree base, base_addr;
618 bool base_aligned;
619 tree misalign;
620 tree aligned_to, alignment;
622 if (dump_enabled_p ())
623 dump_printf_loc (MSG_NOTE, vect_location,
624 "vect_compute_data_ref_alignment:\n");
626 if (loop_vinfo)
627 loop = LOOP_VINFO_LOOP (loop_vinfo);
629 /* Initialize misalignment to unknown. */
630 SET_DR_MISALIGNMENT (dr, -1);
632 /* Strided loads perform only component accesses, misalignment information
633 is irrelevant for them. */
634 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
635 return true;
637 misalign = DR_INIT (dr);
638 aligned_to = DR_ALIGNED_TO (dr);
639 base_addr = DR_BASE_ADDRESS (dr);
640 vectype = STMT_VINFO_VECTYPE (stmt_info);
642 /* In case the dataref is in an inner-loop of the loop that is being
643 vectorized (LOOP), we use the base and misalignment information
644 relative to the outer-loop (LOOP). This is OK only if the misalignment
645 stays the same throughout the execution of the inner-loop, which is why
646 we have to check that the stride of the dataref in the inner-loop is a
647 multiple of the vector size. */
648 if (loop && nested_in_vect_loop_p (loop, stmt))
650 tree step = DR_STEP (dr);
651 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
653 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
655 if (dump_enabled_p ())
656 dump_printf_loc (MSG_NOTE, vect_location,
657 "inner step divides the vector-size.\n");
658 misalign = STMT_VINFO_DR_INIT (stmt_info);
659 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
660 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
662 else
664 if (dump_enabled_p ())
665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
666 "inner step doesn't divide the vector-size.\n");
667 misalign = NULL_TREE;
671 /* Similarly, if we're doing basic-block vectorization, we can only use
672 base and misalignment information relative to an innermost loop if the
673 misalignment stays the same throughout the execution of the loop.
674 As above, this is the case if the stride of the dataref is a multiple
675 of the vector size. */
676 if (!loop)
678 tree step = DR_STEP (dr);
679 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
681 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
683 if (dump_enabled_p ())
684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
685 "SLP: step doesn't divide the vector-size.\n");
686 misalign = NULL_TREE;
690 base = build_fold_indirect_ref (base_addr);
691 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
693 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
694 || !misalign)
696 if (dump_enabled_p ())
698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
699 "Unknown alignment for access: ");
700 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
701 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
703 return true;
706 if ((DECL_P (base)
707 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
708 alignment) >= 0)
709 || (TREE_CODE (base_addr) == SSA_NAME
710 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
711 TREE_TYPE (base_addr)))),
712 alignment) >= 0)
713 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
714 base_aligned = true;
715 else
716 base_aligned = false;
718 if (!base_aligned)
720 /* Do not change the alignment of global variables here if
721 flag_section_anchors is enabled as we already generated
722 RTL for other functions. Most global variables should
723 have been aligned during the IPA increase_alignment pass. */
724 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
725 || (TREE_STATIC (base) && flag_section_anchors))
727 if (dump_enabled_p ())
729 dump_printf_loc (MSG_NOTE, vect_location,
730 "can't force alignment of ref: ");
731 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
732 dump_printf (MSG_NOTE, "\n");
734 return true;
737 /* Force the alignment of the decl.
738 NOTE: This is the only change to the code we make during
739 the analysis phase, before deciding to vectorize the loop. */
740 if (dump_enabled_p ())
742 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
743 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
744 dump_printf (MSG_NOTE, "\n");
747 ((dataref_aux *)dr->aux)->base_decl = base;
748 ((dataref_aux *)dr->aux)->base_misaligned = true;
751 /* If this is a backward running DR then the first access in the larger
752 vectype actually is N-1 elements before the address recorded in the DR.
753 Adjust misalign accordingly. */
754 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
756 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
757 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
758 otherwise we wouldn't be here. */
759 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
760 /* PLUS because DR_STEP was negative. */
761 misalign = size_binop (PLUS_EXPR, misalign, offset);
764 /* Modulo alignment. */
765 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
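/* Worked example (assuming a 16-byte aligned base address): for a
   backward V4SI access with DR_INIT 0 and DR_STEP -4, the offset above
   is 3 * -4 = -12, so misalign becomes -12 and the floor-modulo by the
   16-byte vector alignment yields 4, i.e. the first element touched by
   the vector load lies 4 bytes past a 16-byte boundary.  */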
767 if (!tree_fits_uhwi_p (misalign))
769 /* Negative or overflowed misalignment value. */
770 if (dump_enabled_p ())
771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
772 "unexpected misalign value\n");
773 return false;
776 SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
778 if (dump_enabled_p ())
780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
781 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
782 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
783 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
786 return true;
790 /* Function vect_compute_data_refs_alignment
792 Compute the misalignment of data references in the loop.
793 Return FALSE if a data reference is found that cannot be vectorized. */
795 static bool
796 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
797 bb_vec_info bb_vinfo)
799 vec<data_reference_p> datarefs;
800 struct data_reference *dr;
801 unsigned int i;
803 if (loop_vinfo)
804 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
805 else
806 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
808 FOR_EACH_VEC_ELT (datarefs, i, dr)
809 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
810 && !vect_compute_data_ref_alignment (dr))
812 if (bb_vinfo)
814 /* Mark unsupported statement as unvectorizable. */
815 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
816 continue;
818 else
819 return false;
822 return true;
826 /* Function vect_update_misalignment_for_peel
828 DR - the data reference whose misalignment is to be adjusted.
829 DR_PEEL - the data reference whose misalignment is being made
830 zero in the vector loop by the peel.
831 NPEEL - the number of iterations in the peel loop if the misalignment
832 of DR_PEEL is known at compile time. */
834 static void
835 vect_update_misalignment_for_peel (struct data_reference *dr,
836 struct data_reference *dr_peel, int npeel)
838 unsigned int i;
839 vec<dr_p> same_align_drs;
840 struct data_reference *current_dr;
841 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
842 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
843 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
844 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
846 /* For interleaved data accesses the step in the loop must be multiplied by
847 the size of the interleaving group. */
848 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
849 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
850 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
851 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
853 /* It can be assumed that the data refs with the same alignment as dr_peel
854 are aligned in the vector loop. */
855 same_align_drs
856 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
857 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
859 if (current_dr != dr)
860 continue;
861 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
862 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
863 SET_DR_MISALIGNMENT (dr, 0);
864 return;
867 if (known_alignment_for_access_p (dr)
868 && known_alignment_for_access_p (dr_peel))
870 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
871 int misal = DR_MISALIGNMENT (dr);
872 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
873 misal += negative ? -npeel * dr_size : npeel * dr_size;
874 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
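/* E.g. for a forward access with 4-byte elements, NPEEL == 2, an old
   misalignment of 8 bytes and a 16-byte vector alignment, the new value
   is (8 + 2 * 4) & 15 == 0, i.e. DR becomes aligned by the peel as
   well.  */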
875 SET_DR_MISALIGNMENT (dr, misal);
876 return;
879 if (dump_enabled_p ())
880 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
881 SET_DR_MISALIGNMENT (dr, -1);
885 /* Function vect_verify_datarefs_alignment
887 Return TRUE if all data references in the loop can be
888 handled with respect to alignment. */
890 bool
891 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
893 vec<data_reference_p> datarefs;
894 struct data_reference *dr;
895 enum dr_alignment_support supportable_dr_alignment;
896 unsigned int i;
898 if (loop_vinfo)
899 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
900 else
901 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
903 FOR_EACH_VEC_ELT (datarefs, i, dr)
905 gimple stmt = DR_STMT (dr);
906 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
908 if (!STMT_VINFO_RELEVANT_P (stmt_info))
909 continue;
911 /* For interleaving, only the alignment of the first access matters.
912 Skip statements marked as not vectorizable. */
913 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
914 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
915 || !STMT_VINFO_VECTORIZABLE (stmt_info))
916 continue;
918 /* Strided loads perform only component accesses, alignment is
919 irrelevant for them. */
920 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
921 continue;
923 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
924 if (!supportable_dr_alignment)
926 if (dump_enabled_p ())
928 if (DR_IS_READ (dr))
929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
930 "not vectorized: unsupported unaligned load.");
931 else
932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
933 "not vectorized: unsupported unaligned "
934 "store.");
936 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
937 DR_REF (dr));
938 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
940 return false;
942 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
943 dump_printf_loc (MSG_NOTE, vect_location,
944 "Vectorizing an unaligned access.\n");
946 return true;
949 /* Given a memory reference EXP, return whether its alignment is less
950 than its size. */
952 static bool
953 not_size_aligned (tree exp)
955 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
956 return true;
958 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
959 > get_object_alignment (exp));
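/* For example, a 4-byte integer field of a struct declared with
   __attribute__((packed)) may have an object alignment of only 1 byte,
   in which case this function returns true for a reference to it.  */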
962 /* Function vector_alignment_reachable_p
964 Return true if vector alignment for DR is reachable by peeling
965 a few loop iterations. Return false otherwise. */
967 static bool
968 vector_alignment_reachable_p (struct data_reference *dr)
970 gimple stmt = DR_STMT (dr);
971 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
972 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
974 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
976 /* For interleaved accesses we peel only if the number of iterations in
977 the prolog loop (VF - misalignment) is a multiple of the
978 number of interleaved accesses. */
979 int elem_size, mis_in_elements;
980 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
982 /* FORNOW: handle only known alignment. */
983 if (!known_alignment_for_access_p (dr))
984 return false;
986 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
987 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
989 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
990 return false;
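/* E.g. for V8HI (8 two-byte elements) with a known misalignment of
   4 bytes, the prolog would need 8 - 2 == 6 scalar iterations; with a
   group size of 4 that is not a multiple of the group, so peeling
   cannot align the interleaved access and we give up above.  */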
993 /* If the misalignment is known at compile time then allow peeling
994 only if natural alignment is reachable through peeling. */
995 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
997 HOST_WIDE_INT elmsize =
998 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
999 if (dump_enabled_p ())
1001 dump_printf_loc (MSG_NOTE, vect_location,
1002 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1003 dump_printf (MSG_NOTE,
1004 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1006 if (DR_MISALIGNMENT (dr) % elmsize)
1008 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1010 "data size does not divide the misalignment.\n");
1011 return false;
1015 if (!known_alignment_for_access_p (dr))
1017 tree type = TREE_TYPE (DR_REF (dr));
1018 bool is_packed = not_size_aligned (DR_REF (dr));
1019 if (dump_enabled_p ())
1020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1021 "Unknown misalignment, is_packed = %d\n",is_packed);
1022 if ((TYPE_USER_ALIGN (type) && !is_packed)
1023 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1024 return true;
1025 else
1026 return false;
1029 return true;
1033 /* Calculate the cost of the memory access represented by DR. */
1035 static void
1036 vect_get_data_access_cost (struct data_reference *dr,
1037 unsigned int *inside_cost,
1038 unsigned int *outside_cost,
1039 stmt_vector_for_cost *body_cost_vec)
1041 gimple stmt = DR_STMT (dr);
1042 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1043 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1044 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1045 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1046 int ncopies = vf / nunits;
1048 if (DR_IS_READ (dr))
1049 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1050 NULL, body_cost_vec, false);
1051 else
1052 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1054 if (dump_enabled_p ())
1055 dump_printf_loc (MSG_NOTE, vect_location,
1056 "vect_get_data_access_cost: inside_cost = %d, "
1057 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1061 /* Insert DR into peeling hash table with NPEEL as key. */
1063 static void
1064 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1065 int npeel)
1067 struct _vect_peel_info elem, *slot;
1068 _vect_peel_info **new_slot;
1069 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1071 elem.npeel = npeel;
1072 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1073 if (slot)
1074 slot->count++;
1075 else
1077 slot = XNEW (struct _vect_peel_info);
1078 slot->npeel = npeel;
1079 slot->dr = dr;
1080 slot->count = 1;
1081 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1082 *new_slot = slot;
1085 if (!supportable_dr_alignment
1086 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1087 slot->count += VECT_MAX_COST;
1091 /* Traverse the peeling hash table to find the peeling option that aligns
1092 the maximum number of data accesses. */
1095 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1096 _vect_peel_extended_info *max)
1098 vect_peel_info elem = *slot;
1100 if (elem->count > max->peel_info.count
1101 || (elem->count == max->peel_info.count
1102 && max->peel_info.npeel > elem->npeel))
1104 max->peel_info.npeel = elem->npeel;
1105 max->peel_info.count = elem->count;
1106 max->peel_info.dr = elem->dr;
1109 return 1;
1113 /* Traverse the peeling hash table and calculate the cost of each peeling
1114 option. Find the one with the lowest cost. */
1117 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1118 _vect_peel_extended_info *min)
1120 vect_peel_info elem = *slot;
1121 int save_misalignment, dummy;
1122 unsigned int inside_cost = 0, outside_cost = 0, i;
1123 gimple stmt = DR_STMT (elem->dr);
1124 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1125 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1126 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1127 struct data_reference *dr;
1128 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1129 int single_iter_cost;
1131 prologue_cost_vec.create (2);
1132 body_cost_vec.create (2);
1133 epilogue_cost_vec.create (2);
1135 FOR_EACH_VEC_ELT (datarefs, i, dr)
1137 stmt = DR_STMT (dr);
1138 stmt_info = vinfo_for_stmt (stmt);
1139 /* For interleaving, only the alignment of the first access
1140 matters. */
1141 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1142 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1143 continue;
1145 save_misalignment = DR_MISALIGNMENT (dr);
1146 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1147 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1148 &body_cost_vec);
1149 SET_DR_MISALIGNMENT (dr, save_misalignment);
1152 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1153 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1154 &dummy, single_iter_cost,
1155 &prologue_cost_vec,
1156 &epilogue_cost_vec);
1158 /* Prologue and epilogue costs are added to the target model later.
1159 These costs depend only on the scalar iteration cost, the
1160 number of peeling iterations finally chosen, and the number of
1161 misaligned statements. So discard the information found here. */
1162 prologue_cost_vec.release ();
1163 epilogue_cost_vec.release ();
1165 if (inside_cost < min->inside_cost
1166 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1168 min->inside_cost = inside_cost;
1169 min->outside_cost = outside_cost;
1170 min->body_cost_vec.release ();
1171 min->body_cost_vec = body_cost_vec;
1172 min->peel_info.dr = elem->dr;
1173 min->peel_info.npeel = elem->npeel;
1175 else
1176 body_cost_vec.release ();
1178 return 1;
1182 /* Choose the best peeling option by traversing the peeling hash table and
1183 either choosing the option with the lowest cost (if the cost model is
1184 enabled) or the option that aligns as many accesses as possible. */
1186 static struct data_reference *
1187 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1188 unsigned int *npeel,
1189 stmt_vector_for_cost *body_cost_vec)
1191 struct _vect_peel_extended_info res;
1193 res.peel_info.dr = NULL;
1194 res.body_cost_vec = stmt_vector_for_cost ();
1196 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1198 res.inside_cost = INT_MAX;
1199 res.outside_cost = INT_MAX;
1200 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1201 .traverse <_vect_peel_extended_info *,
1202 vect_peeling_hash_get_lowest_cost> (&res);
1204 else
1206 res.peel_info.count = 0;
1207 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1208 .traverse <_vect_peel_extended_info *,
1209 vect_peeling_hash_get_most_frequent> (&res);
1212 *npeel = res.peel_info.npeel;
1213 *body_cost_vec = res.body_cost_vec;
1214 return res.peel_info.dr;
1218 /* Function vect_enhance_data_refs_alignment
1220 This pass will use loop versioning and loop peeling in order to enhance
1221 the alignment of data references in the loop.
1223 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1224 original loop is to be vectorized. Any other loops that are created by
1225 the transformations performed in this pass - are not supposed to be
1226 vectorized. This restriction will be relaxed.
1228 This pass will require a cost model to guide it whether to apply peeling
1229 or versioning or a combination of the two. For example, the scheme that
1230 Intel uses when given a loop with several memory accesses is as follows:
1231 choose one memory access ('p') whose alignment you want to force by doing
1232 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1233 other accesses are not necessarily aligned, or (2) use loop versioning to
1234 generate one loop in which all accesses are aligned, and another loop in
1235 which only 'p' is necessarily aligned.
1237 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1238 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1239 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1241 Devising a cost model is the most critical aspect of this work. It will
1242 guide us on which access to peel for, whether to use loop versioning, how
1243 many versions to create, etc. The cost model will probably consist of
1244 generic considerations as well as target specific considerations (on
1245 powerpc for example, misaligned stores are more painful than misaligned
1246 loads).
1248 Here are the general steps involved in alignment enhancements:
1250 -- original loop, before alignment analysis:
1251 for (i=0; i<N; i++){
1252 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1253 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1256 -- After vect_compute_data_refs_alignment:
1257 for (i=0; i<N; i++){
1258 x = q[i]; # DR_MISALIGNMENT(q) = 3
1259 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1262 -- Possibility 1: we do loop versioning:
1263 if (p is aligned) {
1264 for (i=0; i<N; i++){ # loop 1A
1265 x = q[i]; # DR_MISALIGNMENT(q) = 3
1266 p[i] = y; # DR_MISALIGNMENT(p) = 0
1269 else {
1270 for (i=0; i<N; i++){ # loop 1B
1271 x = q[i]; # DR_MISALIGNMENT(q) = 3
1272 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1276 -- Possibility 2: we do loop peeling:
1277 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1278 x = q[i];
1279 p[i] = y;
1281 for (i = 3; i < N; i++){ # loop 2A
1282 x = q[i]; # DR_MISALIGNMENT(q) = 0
1283 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1286 -- Possibility 3: combination of loop peeling and versioning:
1287 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1288 x = q[i];
1289 p[i] = y;
1291 if (p is aligned) {
1292 for (i = 3; i<N; i++){ # loop 3A
1293 x = q[i]; # DR_MISALIGNMENT(q) = 0
1294 p[i] = y; # DR_MISALIGNMENT(p) = 0
1297 else {
1298 for (i = 3; i<N; i++){ # loop 3B
1299 x = q[i]; # DR_MISALIGNMENT(q) = 0
1300 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1304 These loops are later passed to loop_transform to be vectorized. The
1305 vectorizer will use the alignment information to guide the transformation
1306 (whether to generate regular loads/stores, or with special handling for
1307 misalignment). */
1309 bool
1310 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1312 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1313 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1314 enum dr_alignment_support supportable_dr_alignment;
1315 struct data_reference *dr0 = NULL, *first_store = NULL;
1316 struct data_reference *dr;
1317 unsigned int i, j;
1318 bool do_peeling = false;
1319 bool do_versioning = false;
1320 bool stat;
1321 gimple stmt;
1322 stmt_vec_info stmt_info;
1323 unsigned int npeel = 0;
1324 bool all_misalignments_unknown = true;
1325 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1326 unsigned possible_npeel_number = 1;
1327 tree vectype;
1328 unsigned int nelements, mis, same_align_drs_max = 0;
1329 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1331 if (dump_enabled_p ())
1332 dump_printf_loc (MSG_NOTE, vect_location,
1333 "=== vect_enhance_data_refs_alignment ===\n");
1335 /* While cost model enhancements are expected in the future, the high level
1336 view of the code at this time is as follows:
1338 A) If there is a misaligned access then see if peeling to align
1339 this access can make all data references satisfy
1340 vect_supportable_dr_alignment. If so, update data structures
1341 as needed and return true.
1343 B) If peeling wasn't possible and there is a data reference with an
1344 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1345 then see if loop versioning checks can be used to make all data
1346 references satisfy vect_supportable_dr_alignment. If so, update
1347 data structures as needed and return true.
1349 C) If neither peeling nor versioning were successful then return false if
1350 any data reference does not satisfy vect_supportable_dr_alignment.
1352 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1354 Note, Possibility 3 above (which is peeling and versioning together) is not
1355 being done at this time. */
1357 /* (1) Peeling to force alignment. */
1359 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1360 Considerations:
1361 + How many accesses will become aligned due to the peeling
1362 - How many accesses will become unaligned due to the peeling,
1363 and the cost of misaligned accesses.
1364 - The cost of peeling (the extra runtime checks, the increase
1365 in code size). */
1367 FOR_EACH_VEC_ELT (datarefs, i, dr)
1369 stmt = DR_STMT (dr);
1370 stmt_info = vinfo_for_stmt (stmt);
1372 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1373 continue;
1375 /* For interleaving, only the alignment of the first access
1376 matters. */
1377 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1378 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1379 continue;
1381 /* For invariant accesses there is nothing to enhance. */
1382 if (integer_zerop (DR_STEP (dr)))
1383 continue;
1385 /* Strided loads perform only component accesses, alignment is
1386 irrelevant for them. */
1387 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1388 continue;
1390 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1391 do_peeling = vector_alignment_reachable_p (dr);
1392 if (do_peeling)
1394 if (known_alignment_for_access_p (dr))
1396 unsigned int npeel_tmp;
1397 bool negative = tree_int_cst_compare (DR_STEP (dr),
1398 size_zero_node) < 0;
1400 /* Save info about DR in the hash table. */
1401 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1402 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1404 vectype = STMT_VINFO_VECTYPE (stmt_info);
1405 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1406 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1407 TREE_TYPE (DR_REF (dr))));
1408 npeel_tmp = (negative
1409 ? (mis - nelements) : (nelements - mis))
1410 & (nelements - 1);
1412 /* For multiple types, it is possible that the bigger type access
1413 will have more than one peeling option. E.g., a loop with two
1414 types: one of size (vector size / 4), and the other one of
1415 size (vector size / 8). The vectorization factor will be 8. If both
1416 accesses are misaligned by 3, the first one needs one scalar
1417 iteration to be aligned, and the second one needs 5. But the
1418 first one will also be aligned by peeling 5 scalar
1419 iterations, and in that case both accesses will be aligned.
1420 Hence, except for the immediate peeling amount, we also want
1421 to try adding a full vector size, as long as we don't exceed the
1422 vectorization factor.
1423 We do this automatically for the cost model, since we calculate the
1424 cost for every peeling option. */
1425 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1426 possible_npeel_number = vf / nelements;
1428 /* Handle the aligned case. We may decide to align some other
1429 access, making DR unaligned. */
1430 if (DR_MISALIGNMENT (dr) == 0)
1432 npeel_tmp = 0;
1433 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1434 possible_npeel_number++;
1437 for (j = 0; j < possible_npeel_number; j++)
1439 gcc_assert (npeel_tmp <= vf);
1440 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1441 npeel_tmp += nelements;
1444 all_misalignments_unknown = false;
1445 /* Data-ref that was chosen for the case that all the
1446 misalignments are unknown is not relevant anymore, since we
1447 have a data-ref with known alignment. */
1448 dr0 = NULL;
1450 else
1452 /* If we don't know any misalignment values, we prefer
1453 peeling for the data-ref that has the maximum number of data-refs
1454 with the same alignment, unless the target prefers to align
1455 stores over loads. */
1456 if (all_misalignments_unknown)
1458 unsigned same_align_drs
1459 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1460 if (!dr0
1461 || same_align_drs_max < same_align_drs)
1463 same_align_drs_max = same_align_drs;
1464 dr0 = dr;
1466 /* For data-refs with the same number of related
1467 accesses prefer the one where the misalign
1468 computation will be invariant in the outermost loop. */
1469 else if (same_align_drs_max == same_align_drs)
1471 struct loop *ivloop0, *ivloop;
1472 ivloop0 = outermost_invariant_loop_for_expr
1473 (loop, DR_BASE_ADDRESS (dr0));
1474 ivloop = outermost_invariant_loop_for_expr
1475 (loop, DR_BASE_ADDRESS (dr));
1476 if ((ivloop && !ivloop0)
1477 || (ivloop && ivloop0
1478 && flow_loop_nested_p (ivloop, ivloop0)))
1479 dr0 = dr;
1482 if (!first_store && DR_IS_WRITE (dr))
1483 first_store = dr;
1486 /* If there are both known and unknown misaligned accesses in the
1487 loop, we choose peeling amount according to the known
1488 accesses. */
1489 if (!supportable_dr_alignment)
1491 dr0 = dr;
1492 if (!first_store && DR_IS_WRITE (dr))
1493 first_store = dr;
1497 else
1499 if (!aligned_access_p (dr))
1501 if (dump_enabled_p ())
1502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1503 "vector alignment may not be reachable\n");
1504 break;
1509 /* Check if we can possibly peel the loop. */
1510 if (!vect_can_advance_ivs_p (loop_vinfo)
1511 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1512 do_peeling = false;
1514 if (do_peeling && all_misalignments_unknown
1515 && vect_supportable_dr_alignment (dr0, false))
1518 /* Check whether the target requires us to prefer stores over loads, i.e.,
1519 whether misaligned stores are more expensive than misaligned loads
1520 (taking drs with the same alignment into account). */
1521 if (first_store && DR_IS_READ (dr0))
1523 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1524 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1525 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1526 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1527 stmt_vector_for_cost dummy;
1528 dummy.create (2);
1530 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1531 &dummy);
1532 vect_get_data_access_cost (first_store, &store_inside_cost,
1533 &store_outside_cost, &dummy);
1535 dummy.release ();
1537 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1538 aligning the load DR0). */
1539 load_inside_penalty = store_inside_cost;
1540 load_outside_penalty = store_outside_cost;
1541 for (i = 0;
1542 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1543 DR_STMT (first_store))).iterate (i, &dr);
1544 i++)
1545 if (DR_IS_READ (dr))
1547 load_inside_penalty += load_inside_cost;
1548 load_outside_penalty += load_outside_cost;
1550 else
1552 load_inside_penalty += store_inside_cost;
1553 load_outside_penalty += store_outside_cost;
1556 /* Calculate the penalty for leaving DR0 unaligned (by
1557 aligning the FIRST_STORE). */
1558 store_inside_penalty = load_inside_cost;
1559 store_outside_penalty = load_outside_cost;
1560 for (i = 0;
1561 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1562 DR_STMT (dr0))).iterate (i, &dr);
1563 i++)
1564 if (DR_IS_READ (dr))
1566 store_inside_penalty += load_inside_cost;
1567 store_outside_penalty += load_outside_cost;
1569 else
1571 store_inside_penalty += store_inside_cost;
1572 store_outside_penalty += store_outside_cost;
1575 if (load_inside_penalty > store_inside_penalty
1576 || (load_inside_penalty == store_inside_penalty
1577 && load_outside_penalty > store_outside_penalty))
1578 dr0 = first_store;
1581 /* In case there are only loads with different unknown misalignments, use
1582 peeling only if it may help to align other accesses in the loop. */
1583 if (!first_store
1584 && !STMT_VINFO_SAME_ALIGN_REFS (
1585 vinfo_for_stmt (DR_STMT (dr0))).length ()
1586 && vect_supportable_dr_alignment (dr0, false)
1587 != dr_unaligned_supported)
1588 do_peeling = false;
1591 if (do_peeling && !dr0)
1593 /* Peeling is possible, but no data access requires alignment in order to
1594 be supported. So we try to choose the best possible peeling. */
1596 /* We should get here only if there are drs with known misalignment. */
1597 gcc_assert (!all_misalignments_unknown);
1599 /* Choose the best peeling from the hash table. */
1600 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1601 &body_cost_vec);
1602 if (!dr0 || !npeel)
1603 do_peeling = false;
1606 if (do_peeling)
1608 stmt = DR_STMT (dr0);
1609 stmt_info = vinfo_for_stmt (stmt);
1610 vectype = STMT_VINFO_VECTYPE (stmt_info);
1611 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1613 if (known_alignment_for_access_p (dr0))
1615 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1616 size_zero_node) < 0;
1617 if (!npeel)
1619 /* Since it's known at compile time, compute the number of
1620 iterations in the peeled loop (the peeling factor) for use in
1621 updating DR_MISALIGNMENT values. The peeling factor is the
1622 vectorization factor minus the misalignment as an element
1623 count. */
1624 mis = DR_MISALIGNMENT (dr0);
1625 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1626 npeel = ((negative ? mis - nelements : nelements - mis)
1627 & (nelements - 1));
1630 /* For interleaved data access every iteration accesses all the
1631 members of the group, therefore we divide the number of iterations
1632 by the group size. */
1633 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1634 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1635 npeel /= GROUP_SIZE (stmt_info);
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_NOTE, vect_location,
1639 "Try peeling by %d\n", npeel);
1642 /* Ensure that all data refs can be vectorized after the peel. */
1643 FOR_EACH_VEC_ELT (datarefs, i, dr)
1645 int save_misalignment;
1647 if (dr == dr0)
1648 continue;
1650 stmt = DR_STMT (dr);
1651 stmt_info = vinfo_for_stmt (stmt);
1652 /* For interleaving, only the alignment of the first access
1653 matters. */
1654 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1655 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1656 continue;
1658 /* Strided loads perform only component accesses, alignment is
1659 irrelevant for them. */
1660 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1661 continue;
1663 save_misalignment = DR_MISALIGNMENT (dr);
1664 vect_update_misalignment_for_peel (dr, dr0, npeel);
1665 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1666 SET_DR_MISALIGNMENT (dr, save_misalignment);
1668 if (!supportable_dr_alignment)
1670 do_peeling = false;
1671 break;
1675 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1677 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1678 if (!stat)
1679 do_peeling = false;
1680 else
1682 body_cost_vec.release ();
1683 return stat;
1687 if (do_peeling)
1689 unsigned max_allowed_peel
1690 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1691 if (max_allowed_peel != (unsigned)-1)
1693 unsigned max_peel = npeel;
1694 if (max_peel == 0)
1696 gimple dr_stmt = DR_STMT (dr0);
1697 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1698 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1699 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1701 if (max_peel > max_allowed_peel)
1703 do_peeling = false;
1704 if (dump_enabled_p ())
1705 dump_printf_loc (MSG_NOTE, vect_location,
1706 "Disable peeling, max peels reached: %d\n", max_peel);
1711 if (do_peeling)
1713 stmt_info_for_cost *si;
1714 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1716 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1717 If the misalignment of DR_i is identical to that of dr0 then set
1718 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1719 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1720 by the peeling factor times the element size of DR_i (MOD the
1721 vectorization factor times the size). Otherwise, the
1722 misalignment of DR_i must be set to unknown. */
1723 FOR_EACH_VEC_ELT (datarefs, i, dr)
1724 if (dr != dr0)
1725 vect_update_misalignment_for_peel (dr, dr0, npeel);
1727 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1728 if (npeel)
1729 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1730 else
1731 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1732 = DR_MISALIGNMENT (dr0);
1733 SET_DR_MISALIGNMENT (dr0, 0);
1734 if (dump_enabled_p ())
1736 dump_printf_loc (MSG_NOTE, vect_location,
1737 "Alignment of access forced using peeling.\n");
1738 dump_printf_loc (MSG_NOTE, vect_location,
1739 "Peeling for alignment will be applied.\n");
1741 /* We've delayed passing the inside-loop peeling costs to the
1742 target cost model until we were sure peeling would happen.
1743 Do so now. */
1744 if (body_cost_vec.exists ())
1746 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1748 struct _stmt_vec_info *stmt_info
1749 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1750 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1751 si->misalign, vect_body);
1753 body_cost_vec.release ();
1756 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1757 gcc_assert (stat);
1758 return stat;
1762 body_cost_vec.release ();
1764 /* (2) Versioning to force alignment. */
1766 /* Try versioning if:
1767 1) optimize loop for speed
1768 2) there is at least one unsupported misaligned data ref with an unknown
1769 misalignment, and
1770 3) all misaligned data refs with a known misalignment are supported, and
1771 4) the number of runtime alignment checks is within reason. */
1773 do_versioning =
1774 optimize_loop_nest_for_speed_p (loop)
1775 && (!loop->inner); /* FORNOW */
1777 if (do_versioning)
1779 FOR_EACH_VEC_ELT (datarefs, i, dr)
1781 stmt = DR_STMT (dr);
1782 stmt_info = vinfo_for_stmt (stmt);
1784 /* For interleaving, only the alignment of the first access
1785 matters. */
1786 if (aligned_access_p (dr)
1787 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1788 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1789 continue;
1791 /* Strided loads perform only component accesses; alignment is
1792 irrelevant for them. */
1793 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1794 continue;
1796 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1798 if (!supportable_dr_alignment)
1800 gimple stmt;
1801 int mask;
1802 tree vectype;
1804 if (known_alignment_for_access_p (dr)
1805 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1806 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1808 do_versioning = false;
1809 break;
1812 stmt = DR_STMT (dr);
1813 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1814 gcc_assert (vectype);
1816 /* The rightmost bits of an aligned address must be zeros.
1817 Construct the mask needed for this test. For example,
1818 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1819 mask must be 15 = 0xf. */
1820 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
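/* The versioning guard built later from LOOP_VINFO_MAY_MISALIGN_STMTS
   essentially tests that (addr_1 | ... | addr_n) & mask == 0, i.e. that
   every collected address is a multiple of the vector size at runtime.  */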
1822 /* FORNOW: use the same mask to test all potentially unaligned
1823 references in the loop. The vectorizer currently supports
1824 a single vector size, see the reference to
1825 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1826 vectorization factor is computed. */
1827 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1828 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1829 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1830 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1831 DR_STMT (dr));
1835 /* Versioning requires at least one misaligned data reference. */
1836 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1837 do_versioning = false;
1838 else if (!do_versioning)
1839 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1842 if (do_versioning)
1844 vec<gimple> may_misalign_stmts
1845 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1846 gimple stmt;
1848 /* It can now be assumed that the data references in the statements
1849 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1850 of the loop being vectorized. */
1851 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1853 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1854 dr = STMT_VINFO_DATA_REF (stmt_info);
1855 SET_DR_MISALIGNMENT (dr, 0);
1856 if (dump_enabled_p ())
1857 dump_printf_loc (MSG_NOTE, vect_location,
1858 "Alignment of access forced using versioning.\n");
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_NOTE, vect_location,
1863 "Versioning for alignment will be applied.\n");
1865 /* Peeling and versioning can't be done together at this time. */
1866 gcc_assert (! (do_peeling && do_versioning));
1868 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1869 gcc_assert (stat);
1870 return stat;
1873 /* This point is reached if neither peeling nor versioning is being done. */
1874 gcc_assert (! (do_peeling || do_versioning));
1876 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1877 return stat;
1881 /* Function vect_find_same_alignment_drs.
1883 Update group and alignment relations according to the chosen
1884 vectorization factor. */
1886 static void
1887 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1888 loop_vec_info loop_vinfo)
1890 unsigned int i;
1891 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1892 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1893 struct data_reference *dra = DDR_A (ddr);
1894 struct data_reference *drb = DDR_B (ddr);
1895 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1896 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1897 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1898 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1899 lambda_vector dist_v;
1900 unsigned int loop_depth;
1902 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1903 return;
1905 if (dra == drb)
1906 return;
1908 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1909 return;
1911 /* Loop-based vectorization and known data dependence. */
1912 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1913 return;
1915 /* Data-dependence analysis reports a distance vector of zero
1916 for data-references that overlap only in the first iteration
1917 but have steps of different sign (see PR45764).
1918 So as a sanity check require equal DR_STEP. */
1919 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1920 return;
1922 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1923 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1925 int dist = dist_v[loop_depth];
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_NOTE, vect_location,
1929 "dependence distance = %d.\n", dist);
1931 /* Same loop iteration. */
1932 if (dist == 0
1933 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1935 /* Such references (distance 0, or a multiple of VF with equal size) have the same alignment. */
1936 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1937 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
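/* Example: with a vectorization factor of 4, unit-stride 4-byte accesses
   and a dependence distance of 8, the two references are 32 bytes apart,
   a multiple of the 16-byte vector size, so they necessarily share the
   same misalignment.  */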
1938 if (dump_enabled_p ())
1940 dump_printf_loc (MSG_NOTE, vect_location,
1941 "accesses have the same alignment.\n");
1942 dump_printf (MSG_NOTE,
1943 "dependence distance modulo vf == 0 between ");
1944 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1945 dump_printf (MSG_NOTE, " and ");
1946 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1947 dump_printf (MSG_NOTE, "\n");
1954 /* Function vect_analyze_data_refs_alignment
1956 Analyze the alignment of the data-references in the loop.
1957 Return FALSE if a data reference is found that cannot be vectorized. */
1959 bool
1960 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1961 bb_vec_info bb_vinfo)
1963 if (dump_enabled_p ())
1964 dump_printf_loc (MSG_NOTE, vect_location,
1965 "=== vect_analyze_data_refs_alignment ===\n");
1967 /* Mark groups of data references with same alignment using
1968 data dependence information. */
1969 if (loop_vinfo)
1971 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1972 struct data_dependence_relation *ddr;
1973 unsigned int i;
1975 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1976 vect_find_same_alignment_drs (ddr, loop_vinfo);
1979 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1981 if (dump_enabled_p ())
1982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1983 "not vectorized: can't calculate alignment "
1984 "for data ref.\n");
1985 return false;
1988 return true;
1992 /* Analyze groups of accesses: check that DR belongs to a group of
1993 accesses of legal size, step, etc. Detect gaps, single element
1994 interleaving, and other special cases. Set grouped access info.
1995 Collect groups of strided stores for further use in SLP analysis. */
1997 static bool
1998 vect_analyze_group_access (struct data_reference *dr)
2000 tree step = DR_STEP (dr);
2001 tree scalar_type = TREE_TYPE (DR_REF (dr));
2002 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2003 gimple stmt = DR_STMT (dr);
2004 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2005 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2006 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2007 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2008 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2009 bool slp_impossible = false;
2010 struct loop *loop = NULL;
2012 if (loop_vinfo)
2013 loop = LOOP_VINFO_LOOP (loop_vinfo);
2015 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2016 size of the interleaving group (including gaps). */
2017 groupsize = absu_hwi (dr_step) / type_size;
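/* E.g., 4-byte elements accessed with DR_STEP 32 give a groupsize of
   8 elements per scalar iteration.  */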
2019 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2020 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2022 /* Check if this DR is a part of interleaving, and is a single
2023 element of the group that is accessed in the loop. */
2025 /* Gaps are supported only for loads. STEP must be a multiple of the type
2026 size. The size of the group must be a power of 2. */
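/* For example, a load of a[4*i] with 4-byte elements has step 16 and
   groupsize 4 (a power of two), so it is accepted below as single
   element interleaving with three gap elements per group.  */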
2027 if (DR_IS_READ (dr)
2028 && (dr_step % type_size) == 0
2029 && groupsize > 0
2030 && exact_log2 (groupsize) != -1)
2032 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2033 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2034 if (dump_enabled_p ())
2036 dump_printf_loc (MSG_NOTE, vect_location,
2037 "Detected single element interleaving ");
2038 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2039 dump_printf (MSG_NOTE, " step ");
2040 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2041 dump_printf (MSG_NOTE, "\n");
2044 if (loop_vinfo)
2046 if (dump_enabled_p ())
2047 dump_printf_loc (MSG_NOTE, vect_location,
2048 "Data access with gaps requires scalar "
2049 "epilogue loop\n");
2050 if (loop->inner)
2052 if (dump_enabled_p ())
2053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2054 "Peeling for outer loop is not"
2055 " supported\n");
2056 return false;
2059 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2062 return true;
2065 if (dump_enabled_p ())
2067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2068 "not consecutive access ");
2069 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2070 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2073 if (bb_vinfo)
2075 /* Mark the statement as unvectorizable. */
2076 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2077 return true;
2080 return false;
2083 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2085 /* First stmt in the interleaving chain. Check the chain. */
2086 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2087 struct data_reference *data_ref = dr;
2088 unsigned int count = 1;
2089 tree prev_init = DR_INIT (data_ref);
2090 gimple prev = stmt;
2091 HOST_WIDE_INT diff, gaps = 0;
2092 unsigned HOST_WIDE_INT count_in_bytes;
2094 while (next)
2096 /* Skip same data-refs. In case two or more stmts share
2097 data-ref (supported only for loads), we vectorize only the first
2098 stmt, and the rest get their vectorized loads from the first
2099 one. */
2100 if (!tree_int_cst_compare (DR_INIT (data_ref),
2101 DR_INIT (STMT_VINFO_DATA_REF (
2102 vinfo_for_stmt (next)))))
2104 if (DR_IS_WRITE (data_ref))
2106 if (dump_enabled_p ())
2107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2108 "Two store stmts share the same dr.\n");
2109 return false;
2112 /* For load use the same data-ref load. */
2113 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2115 prev = next;
2116 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2117 continue;
2120 prev = next;
2121 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2123 /* All group members have the same STEP by construction. */
2124 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2126 /* Check that the distance between two accesses is equal to the type
2127 size. Otherwise, we have gaps. */
2128 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2129 - TREE_INT_CST_LOW (prev_init)) / type_size;
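/* E.g., group members at byte offsets 0, 4 and 12 with 4-byte elements
   give diffs of 1 and 2; the diff of 2 records a one-element gap.  */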
2130 if (diff != 1)
2132 /* FORNOW: SLP of accesses with gaps is not supported. */
2133 slp_impossible = true;
2134 if (DR_IS_WRITE (data_ref))
2136 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2138 "interleaved store with gaps\n");
2139 return false;
2142 gaps += diff - 1;
2145 last_accessed_element += diff;
2147 /* Store the gap from the previous member of the group. If there is no
2148 gap in the access, GROUP_GAP is always 1. */
2149 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2151 prev_init = DR_INIT (data_ref);
2152 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2153 /* Count the number of data-refs in the chain. */
2154 count++;
2157 /* COUNT is the number of accesses found; we multiply it by the size of
2158 the type to get COUNT_IN_BYTES. */
2159 count_in_bytes = type_size * count;
2161 /* Check that the size of the interleaving (including gaps) is not
2162 greater than STEP. */
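/* For instance, three 4-byte accesses with one gap element give
   COUNT_IN_BYTES = 12 and GAPS * TYPE_SIZE = 4; their sum must not
   exceed a 16-byte |STEP|.  */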
2163 if (dr_step != 0
2164 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2166 if (dump_enabled_p ())
2168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2169 "interleaving size is greater than step for ");
2170 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2171 DR_REF (dr));
2172 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2174 return false;
2177 /* Check that the size of the interleaving is equal to STEP for stores,
2178 i.e., that there are no gaps. */
2179 if (dr_step != 0
2180 && absu_hwi (dr_step) != count_in_bytes)
2182 if (DR_IS_READ (dr))
2184 slp_impossible = true;
2185 /* There is a gap after the last load in the group. This gap is the
2186 difference between the groupsize and the number of elements.
2187 When there is no gap, this difference should be 0. */
2188 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
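/* E.g., a groupsize of 4 with only 3 loads present leaves GROUP_GAP = 1
   for the element missing after the last load.  */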
2190 else
2192 if (dump_enabled_p ())
2193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2194 "interleaved store with gaps\n");
2195 return false;
2199 /* Check that STEP is a multiple of type size. */
2200 if (dr_step != 0
2201 && (dr_step % type_size) != 0)
2203 if (dump_enabled_p ())
2205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2206 "step is not a multiple of type size: step ");
2207 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2208 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2209 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2210 TYPE_SIZE_UNIT (scalar_type));
2211 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2213 return false;
2216 if (groupsize == 0)
2217 groupsize = count;
2219 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2220 if (dump_enabled_p ())
2221 dump_printf_loc (MSG_NOTE, vect_location,
2222 "Detected interleaving of size %d\n", (int)groupsize);
2224 /* SLP: create an SLP data structure for every interleaving group of
2225 stores for further analysis in vect_analyze_slp. */
2226 if (DR_IS_WRITE (dr) && !slp_impossible)
2228 if (loop_vinfo)
2229 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2230 if (bb_vinfo)
2231 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2234 /* There is a gap in the end of the group. */
2235 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2237 if (dump_enabled_p ())
2238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2239 "Data access with gaps requires scalar "
2240 "epilogue loop\n");
2241 if (loop->inner)
2243 if (dump_enabled_p ())
2244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2245 "Peeling for outer loop is not supported\n");
2246 return false;
2249 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2253 return true;
2257 /* Analyze the access pattern of the data-reference DR.
2258 In case of non-consecutive accesses call vect_analyze_group_access() to
2259 analyze groups of accesses. */
2261 static bool
2262 vect_analyze_data_ref_access (struct data_reference *dr)
2264 tree step = DR_STEP (dr);
2265 tree scalar_type = TREE_TYPE (DR_REF (dr));
2266 gimple stmt = DR_STMT (dr);
2267 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2268 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2269 struct loop *loop = NULL;
2271 if (loop_vinfo)
2272 loop = LOOP_VINFO_LOOP (loop_vinfo);
2274 if (loop_vinfo && !step)
2276 if (dump_enabled_p ())
2277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2278 "bad data-ref access in loop\n");
2279 return false;
2282 /* Allow invariant loads in not nested loops. */
2283 if (loop_vinfo && integer_zerop (step))
2285 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2286 if (nested_in_vect_loop_p (loop, stmt))
2288 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_NOTE, vect_location,
2290 "zero step in inner loop of nest\n");
2291 return false;
2293 return DR_IS_READ (dr);
2296 if (loop && nested_in_vect_loop_p (loop, stmt))
2298 /* Interleaved accesses are not yet supported within outer-loop
2299 vectorization for references in the inner-loop. */
2300 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2302 /* For the rest of the analysis we use the outer-loop step. */
2303 step = STMT_VINFO_DR_STEP (stmt_info);
2304 if (integer_zerop (step))
2306 if (dump_enabled_p ())
2307 dump_printf_loc (MSG_NOTE, vect_location,
2308 "zero step in outer loop.\n");
2309 if (DR_IS_READ (dr))
2310 return true;
2311 else
2312 return false;
2316 /* Consecutive? */
2317 if (TREE_CODE (step) == INTEGER_CST)
2319 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2320 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2321 || (dr_step < 0
2322 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2324 /* Mark that it is not interleaving. */
2325 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2326 return true;
2330 if (loop && nested_in_vect_loop_p (loop, stmt))
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_NOTE, vect_location,
2334 "grouped access in outer loop.\n");
2335 return false;
2338 /* Assume this is a DR handled by the non-constant strided load case. */
2339 if (TREE_CODE (step) != INTEGER_CST)
2340 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2342 /* Not consecutive access - check if it's a part of interleaving group. */
2343 return vect_analyze_group_access (dr);
2348 /* A helper function used in the comparator function to sort data
2349 references. T1 and T2 are the two trees to be compared.
2350 The function returns -1, 0, or 1. */
2352 static int
2353 compare_tree (tree t1, tree t2)
2355 int i, cmp;
2356 enum tree_code code;
2357 char tclass;
2359 if (t1 == t2)
2360 return 0;
2361 if (t1 == NULL)
2362 return -1;
2363 if (t2 == NULL)
2364 return 1;
2367 if (TREE_CODE (t1) != TREE_CODE (t2))
2368 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2370 code = TREE_CODE (t1);
2371 switch (code)
2373 /* For const values, we can just use hash values for comparisons. */
2374 case INTEGER_CST:
2375 case REAL_CST:
2376 case FIXED_CST:
2377 case STRING_CST:
2378 case COMPLEX_CST:
2379 case VECTOR_CST:
2381 hashval_t h1 = iterative_hash_expr (t1, 0);
2382 hashval_t h2 = iterative_hash_expr (t2, 0);
2383 if (h1 != h2)
2384 return h1 < h2 ? -1 : 1;
2385 break;
2388 case SSA_NAME:
2389 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2390 if (cmp != 0)
2391 return cmp;
2393 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2394 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2395 break;
2397 default:
2398 tclass = TREE_CODE_CLASS (code);
2400 /* For var-decl, we could compare their UIDs. */
2401 if (tclass == tcc_declaration)
2403 if (DECL_UID (t1) != DECL_UID (t2))
2404 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2405 break;
2408 /* For expressions with operands, compare their operands recursively. */
2409 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2411 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2412 if (cmp != 0)
2413 return cmp;
2417 return 0;
2421 /* Compare two data-references DRA and DRB so that they can be sorted
2422 into chunks suitable for grouping. */
2424 static int
2425 dr_group_sort_cmp (const void *dra_, const void *drb_)
2427 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2428 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2429 int cmp;
2431 /* Stabilize sort. */
2432 if (dra == drb)
2433 return 0;
2435 /* Ordering of DRs according to base. */
2436 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2438 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2439 if (cmp != 0)
2440 return cmp;
2443 /* And according to DR_OFFSET. */
2444 if (!dr_equal_offsets_p (dra, drb))
2446 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2447 if (cmp != 0)
2448 return cmp;
2451 /* Put reads before writes. */
2452 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2453 return DR_IS_READ (dra) ? -1 : 1;
2455 /* Then sort by access size. */
2456 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2457 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2459 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2460 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2461 if (cmp != 0)
2462 return cmp;
2465 /* And by step. */
2466 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2468 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2469 if (cmp != 0)
2470 return cmp;
2473 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2474 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2475 if (cmp == 0)
2476 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2477 return cmp;
2480 /* Function vect_analyze_data_ref_accesses.
2482 Analyze the access pattern of all the data references in the loop.
2484 FORNOW: the only access pattern that is considered vectorizable is a
2485 simple step 1 (consecutive) access.
2487 FORNOW: handle only arrays and pointer accesses. */
2489 bool
2490 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2492 unsigned int i;
2493 vec<data_reference_p> datarefs;
2494 struct data_reference *dr;
2496 if (dump_enabled_p ())
2497 dump_printf_loc (MSG_NOTE, vect_location,
2498 "=== vect_analyze_data_ref_accesses ===\n");
2500 if (loop_vinfo)
2501 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2502 else
2503 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2505 if (datarefs.is_empty ())
2506 return true;
2508 /* Sort the array of datarefs to make building the interleaving chains
2510 linear. Don't modify the original vector's order; it is needed for
2510 determining what dependencies are reversed. */
2511 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2512 qsort (datarefs_copy.address (), datarefs_copy.length (),
2513 sizeof (data_reference_p), dr_group_sort_cmp);
2515 /* Build the interleaving chains. */
2516 for (i = 0; i < datarefs_copy.length () - 1;)
2518 data_reference_p dra = datarefs_copy[i];
2519 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2520 stmt_vec_info lastinfo = NULL;
2521 for (i = i + 1; i < datarefs_copy.length (); ++i)
2523 data_reference_p drb = datarefs_copy[i];
2524 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2526 /* ??? Imperfect sorting (non-compatible types, non-modulo
2527 accesses, same accesses) can lead to a group to be artificially
2528 split here as we don't just skip over those. If it really
2529 matters we can push those to a worklist and re-iterate
2530 over them. Then we can just skip ahead to the next DR here. */
2532 /* Check that the data-refs have same first location (except init)
2533 and they are both either store or load (not load and store,
2534 not masked loads or stores). */
2535 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2536 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2537 DR_BASE_ADDRESS (drb), 0)
2538 || !dr_equal_offsets_p (dra, drb)
2539 || !gimple_assign_single_p (DR_STMT (dra))
2540 || !gimple_assign_single_p (DR_STMT (drb)))
2541 break;
2543 /* Check that the data-refs have the same constant size and step. */
2544 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2545 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2546 if (!tree_fits_uhwi_p (sza)
2547 || !tree_fits_uhwi_p (szb)
2548 || !tree_int_cst_equal (sza, szb)
2549 || !tree_fits_shwi_p (DR_STEP (dra))
2550 || !tree_fits_shwi_p (DR_STEP (drb))
2551 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2552 break;
2554 /* Do not place the same access in the interleaving chain twice. */
2555 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2556 break;
2558 /* Check the types are compatible.
2559 ??? We don't distinguish this during sorting. */
2560 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2561 TREE_TYPE (DR_REF (drb))))
2562 break;
2564 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2565 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2566 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2567 gcc_assert (init_a < init_b);
2569 /* If init_b == init_a + the size of the type * k, we have an
2570 interleaving, and DRA is accessed before DRB. */
2571 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2572 if ((init_b - init_a) % type_size_a != 0)
2573 break;
2575 /* The step (if not zero) must be greater than the difference between
2576 data-refs' inits; this splits groups into suitable sizes. */
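/* For example, with step 16 and 4-byte elements, accesses at inits 0, 4,
   8 and 12 stay in one chain, while an access at init 16 satisfies
   16 <= 16 - 0 and therefore starts a new chain.  */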
2577 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2578 if (step != 0 && step <= (init_b - init_a))
2579 break;
2581 if (dump_enabled_p ())
2583 dump_printf_loc (MSG_NOTE, vect_location,
2584 "Detected interleaving ");
2585 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2586 dump_printf (MSG_NOTE, " and ");
2587 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2588 dump_printf (MSG_NOTE, "\n");
2591 /* Link the found element into the group list. */
2592 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2594 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2595 lastinfo = stmtinfo_a;
2597 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2598 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2599 lastinfo = stmtinfo_b;
2603 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2604 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2605 && !vect_analyze_data_ref_access (dr))
2607 if (dump_enabled_p ())
2608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2609 "not vectorized: complicated access pattern.\n");
2611 if (bb_vinfo)
2613 /* Mark the statement as not vectorizable. */
2614 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2615 continue;
2617 else
2619 datarefs_copy.release ();
2620 return false;
2624 datarefs_copy.release ();
2625 return true;
2629 /* Operator == between two dr_with_seg_len objects.
2631 This equality operator is used to make sure two data refs
2632 are the same one, so that we will consider combining the
2633 aliasing checks of those two pairs of data-dependent data
2634 refs. */
2636 static bool
2637 operator == (const dr_with_seg_len& d1,
2638 const dr_with_seg_len& d2)
2640 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2641 DR_BASE_ADDRESS (d2.dr), 0)
2642 && compare_tree (d1.offset, d2.offset) == 0
2643 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2646 /* Function comp_dr_with_seg_len_pair.
2648 Comparison function for sorting objects of dr_with_seg_len_pair_t
2649 so that we can combine aliasing checks in one scan. */
2651 static int
2652 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2654 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2655 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2657 const dr_with_seg_len &p11 = p1->first,
2658 &p12 = p1->second,
2659 &p21 = p2->first,
2660 &p22 = p2->second;
2662 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2663 if a and c have the same base address and step, and b and d have the same
2664 address and step. Therefore, if either a&c or b&d do not have the same
2665 address and step, we don't care about the order of those two pairs after sorting.
2666 int comp_res;
2668 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2669 DR_BASE_ADDRESS (p21.dr))) != 0)
2670 return comp_res;
2671 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2672 DR_BASE_ADDRESS (p22.dr))) != 0)
2673 return comp_res;
2674 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2675 return comp_res;
2676 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2677 return comp_res;
2678 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2679 return comp_res;
2680 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2681 return comp_res;
2683 return 0;
2686 template <class T> static void
2687 swap (T& a, T& b)
2689 T c (a);
2690 a = b;
2691 b = c;
2694 /* Function vect_vfa_segment_size.
2696 Create an expression that computes the size of segment
2697 that will be accessed for a data reference. The function takes into
2698 account that realignment loads may access one more vector.
2700 Input:
2701 DR: The data reference.
2702 LENGTH_FACTOR: segment length to consider.
2704 Return an expression whose value is the size of segment which will be
2705 accessed by DR. */
2707 static tree
2708 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2710 tree segment_length;
2712 if (integer_zerop (DR_STEP (dr)))
2713 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2714 else
2715 segment_length = size_binop (MULT_EXPR,
2716 fold_convert (sizetype, DR_STEP (dr)),
2717 fold_convert (sizetype, length_factor));
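/* E.g., for a data-ref with a 4-byte step and LENGTH_FACTOR equal to a
   vectorization factor of 4, the segment is 16 bytes; the realignment
   case below adds one extra vector size on top of that.  */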
2719 if (vect_supportable_dr_alignment (dr, false)
2720 == dr_explicit_realign_optimized)
2722 tree vector_size = TYPE_SIZE_UNIT
2723 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2725 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2727 return segment_length;
2730 /* Function vect_prune_runtime_alias_test_list.
2732 Prune a list of ddrs to be tested at run-time by versioning for alias.
2733 Merge several alias checks into one if possible.
2734 Return FALSE if the resulting list of ddrs is longer than allowed by
2735 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2737 bool
2738 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2740 vec<ddr_p> may_alias_ddrs =
2741 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2742 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2743 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2744 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2745 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2747 ddr_p ddr;
2748 unsigned int i;
2749 tree length_factor;
2751 if (dump_enabled_p ())
2752 dump_printf_loc (MSG_NOTE, vect_location,
2753 "=== vect_prune_runtime_alias_test_list ===\n");
2755 if (may_alias_ddrs.is_empty ())
2756 return true;
2758 /* Basically, for each pair of dependent data refs store_ptr_0
2759 and load_ptr_0, we create an expression:
2761 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2762 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2764 for aliasing checks. However, in some cases we can decrease
2765 the number of checks by combining two checks into one. For
2766 example, suppose we have another pair of data refs store_ptr_0
2767 and load_ptr_1, and if the following condition is satisfied:
2769 load_ptr_0 < load_ptr_1 &&
2770 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2772 (this condition means, in each iteration of vectorized loop,
2773 the accessed memory of store_ptr_0 cannot be between the memory
2774 of load_ptr_0 and load_ptr_1.)
2776 we then can use only the following expression to finish the
2777 aliasing checks between store_ptr_0 & load_ptr_0 and
2778 store_ptr_0 & load_ptr_1:
2780 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2781 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2783 Note that we only consider merging checks when load_ptr_0 and
2784 load_ptr_1 have the same base address. */
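/* Illustration: a store to s[i] checked against loads from l[i] and
   l[i + 1] (4-byte elements) produces two pairs that share the store;
   the merging loop below keeps a single check whose load segment length
   is extended by the 4-byte difference between the two load offsets.  */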
2786 comp_alias_ddrs.create (may_alias_ddrs.length ());
2788 /* First, we collect all data ref pairs for aliasing checks. */
2789 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2791 struct data_reference *dr_a, *dr_b;
2792 gimple dr_group_first_a, dr_group_first_b;
2793 tree segment_length_a, segment_length_b;
2794 gimple stmt_a, stmt_b;
2796 dr_a = DDR_A (ddr);
2797 stmt_a = DR_STMT (DDR_A (ddr));
2798 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2799 if (dr_group_first_a)
2801 stmt_a = dr_group_first_a;
2802 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2805 dr_b = DDR_B (ddr);
2806 stmt_b = DR_STMT (DDR_B (ddr));
2807 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2808 if (dr_group_first_b)
2810 stmt_b = dr_group_first_b;
2811 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2814 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2815 length_factor = scalar_loop_iters;
2816 else
2817 length_factor = size_int (vect_factor);
2818 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2819 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2821 dr_with_seg_len_pair_t dr_with_seg_len_pair
2822 (dr_with_seg_len (dr_a, segment_length_a),
2823 dr_with_seg_len (dr_b, segment_length_b));
2825 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2826 swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2828 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2831 /* Second, we sort the collected data ref pairs so that we can scan
2832 them once to combine all possible aliasing checks. */
2833 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2835 /* Third, we scan the sorted dr pairs and check if we can combine
2836 alias checks of two neighbouring dr pairs. */
2837 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2839 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2840 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2841 *dr_b1 = &comp_alias_ddrs[i-1].second,
2842 *dr_a2 = &comp_alias_ddrs[i].first,
2843 *dr_b2 = &comp_alias_ddrs[i].second;
2845 /* Remove duplicate data ref pairs. */
2846 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2848 if (dump_enabled_p ())
2850 dump_printf_loc (MSG_NOTE, vect_location,
2851 "found equal ranges ");
2852 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2853 DR_REF (dr_a1->dr));
2854 dump_printf (MSG_NOTE, ", ");
2855 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2856 DR_REF (dr_b1->dr));
2857 dump_printf (MSG_NOTE, " and ");
2858 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2859 DR_REF (dr_a2->dr));
2860 dump_printf (MSG_NOTE, ", ");
2861 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2862 DR_REF (dr_b2->dr));
2863 dump_printf (MSG_NOTE, "\n");
2866 comp_alias_ddrs.ordered_remove (i--);
2867 continue;
2870 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2872 /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
2873 and DR_A1 and DR_A2 are two consecutive memrefs. */
2874 if (*dr_a1 == *dr_a2)
2876 swap (dr_a1, dr_b1);
2877 swap (dr_a2, dr_b2);
2880 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2881 DR_BASE_ADDRESS (dr_a2->dr),
2883 || !tree_fits_shwi_p (dr_a1->offset)
2884 || !tree_fits_shwi_p (dr_a2->offset))
2885 continue;
2887 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2888 - tree_to_shwi (dr_a1->offset));
2891 /* Now we check if the following condition is satisfied:
2893 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2895 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2896 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2897 have to make a best estimation. We can get the minimum value
2898 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2899 then either of the following two conditions can guarantee the
2900 one above:
2902 1: DIFF <= MIN_SEG_LEN_B
2903 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
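/* E.g., DIFF = 4 and MIN_SEG_LEN_B = 16 satisfy condition 1, so the two
   checks are merged and DR_A1's segment length becomes DR_A2's segment
   length plus 4.  */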
2907 HOST_WIDE_INT
2908 min_seg_len_b = (TREE_CODE (dr_b1->seg_len) == INTEGER_CST) ?
2909 TREE_INT_CST_LOW (dr_b1->seg_len) :
2910 vect_factor;
2912 if (diff <= min_seg_len_b
2913 || (TREE_CODE (dr_a1->seg_len) == INTEGER_CST
2914 && diff - (HOST_WIDE_INT) TREE_INT_CST_LOW (dr_a1->seg_len) <
2915 min_seg_len_b))
2917 if (dump_enabled_p ())
2919 dump_printf_loc (MSG_NOTE, vect_location,
2920 "merging ranges for ");
2921 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2922 DR_REF (dr_a1->dr));
2923 dump_printf (MSG_NOTE, ", ");
2924 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2925 DR_REF (dr_b1->dr));
2926 dump_printf (MSG_NOTE, " and ");
2927 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2928 DR_REF (dr_a2->dr));
2929 dump_printf (MSG_NOTE, ", ");
2930 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2931 DR_REF (dr_b2->dr));
2932 dump_printf (MSG_NOTE, "\n");
2935 dr_a1->seg_len = size_binop (PLUS_EXPR,
2936 dr_a2->seg_len, size_int (diff));
2937 comp_alias_ddrs.ordered_remove (i--);
2942 dump_printf_loc (MSG_NOTE, vect_location,
2943 "improved number of alias checks from %d to %d\n",
2944 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2945 if ((int) comp_alias_ddrs.length () >
2946 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2947 return false;
2949 return true;
2952 /* Check whether a non-affine read in stmt is suitable for gather load
2953 and if so, return a builtin decl for that operation. */
2955 tree
2956 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2957 tree *offp, int *scalep)
2959 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2960 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2961 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2962 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2963 tree offtype = NULL_TREE;
2964 tree decl, base, off;
2965 enum machine_mode pmode;
2966 int punsignedp, pvolatilep;
2968 base = DR_REF (dr);
2969 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
2970 see if we can use the def stmt of the address. */
2971 if (is_gimple_call (stmt)
2972 && gimple_call_internal_p (stmt)
2973 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2974 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2975 && TREE_CODE (base) == MEM_REF
2976 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2977 && integer_zerop (TREE_OPERAND (base, 1))
2978 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2980 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2981 if (is_gimple_assign (def_stmt)
2982 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
2983 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2986 /* The gather builtins need address of the form
2987 loop_invariant + vector * {1, 2, 4, 8}
2989 or loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2990 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2991 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2992 multiplications and additions in it. To get a vector, we need
2993 a single SSA_NAME that will be defined in the loop and will
2994 contain everything that is not loop invariant and that can be
2995 vectorized. The following code attempts to find such a preexisting
2996 SSA_NAME OFF and put the loop invariants into a tree BASE
2997 that can be gimplified before the loop. */
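/* For a typical gather such as p[idx[i]] with 4-byte elements and a
   loop-invariant pointer p, the code below would end up with BASE = p,
   OFF = the SSA_NAME holding idx[i], and SCALE = 4.  */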
2998 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
2999 &pmode, &punsignedp, &pvolatilep, false);
3000 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3002 if (TREE_CODE (base) == MEM_REF)
3004 if (!integer_zerop (TREE_OPERAND (base, 1)))
3006 if (off == NULL_TREE)
3008 double_int moff = mem_ref_offset (base);
3009 off = double_int_to_tree (sizetype, moff);
3011 else
3012 off = size_binop (PLUS_EXPR, off,
3013 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3015 base = TREE_OPERAND (base, 0);
3017 else
3018 base = build_fold_addr_expr (base);
3020 if (off == NULL_TREE)
3021 off = size_zero_node;
3023 /* If base is not loop invariant, then if off is 0 we start with just
3024 the constant offset in the loop-invariant BASE and continue with base
3025 as OFF; otherwise give up.
3026 We could handle that case by gimplifying the addition of base + off
3027 into some SSA_NAME and using that as off, but for now punt. */
3028 if (!expr_invariant_in_loop_p (loop, base))
3030 if (!integer_zerop (off))
3031 return NULL_TREE;
3032 off = base;
3033 base = size_int (pbitpos / BITS_PER_UNIT);
3035 /* Otherwise put base + constant offset into the loop invariant BASE
3036 and continue with OFF. */
3037 else
3039 base = fold_convert (sizetype, base);
3040 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3043 /* OFF at this point may be either a SSA_NAME or some tree expression
3044 from get_inner_reference. Try to peel off loop invariants from it
3045 into BASE as long as possible. */
3046 STRIP_NOPS (off);
3047 while (offtype == NULL_TREE)
3049 enum tree_code code;
3050 tree op0, op1, add = NULL_TREE;
3052 if (TREE_CODE (off) == SSA_NAME)
3054 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3056 if (expr_invariant_in_loop_p (loop, off))
3057 return NULL_TREE;
3059 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3060 break;
3062 op0 = gimple_assign_rhs1 (def_stmt);
3063 code = gimple_assign_rhs_code (def_stmt);
3064 op1 = gimple_assign_rhs2 (def_stmt);
3066 else
3068 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3069 return NULL_TREE;
3070 code = TREE_CODE (off);
3071 extract_ops_from_tree (off, &code, &op0, &op1);
3073 switch (code)
3075 case POINTER_PLUS_EXPR:
3076 case PLUS_EXPR:
3077 if (expr_invariant_in_loop_p (loop, op0))
3079 add = op0;
3080 off = op1;
3081 do_add:
3082 add = fold_convert (sizetype, add);
3083 if (scale != 1)
3084 add = size_binop (MULT_EXPR, add, size_int (scale));
3085 base = size_binop (PLUS_EXPR, base, add);
3086 continue;
3088 if (expr_invariant_in_loop_p (loop, op1))
3090 add = op1;
3091 off = op0;
3092 goto do_add;
3094 break;
3095 case MINUS_EXPR:
3096 if (expr_invariant_in_loop_p (loop, op1))
3098 add = fold_convert (sizetype, op1);
3099 add = size_binop (MINUS_EXPR, size_zero_node, add);
3100 off = op0;
3101 goto do_add;
3103 break;
3104 case MULT_EXPR:
3105 if (scale == 1 && tree_fits_shwi_p (op1))
3107 scale = tree_to_shwi (op1);
3108 off = op0;
3109 continue;
3111 break;
3112 case SSA_NAME:
3113 off = op0;
3114 continue;
3115 CASE_CONVERT:
3116 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3117 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3118 break;
3119 if (TYPE_PRECISION (TREE_TYPE (op0))
3120 == TYPE_PRECISION (TREE_TYPE (off)))
3122 off = op0;
3123 continue;
3125 if (TYPE_PRECISION (TREE_TYPE (op0))
3126 < TYPE_PRECISION (TREE_TYPE (off)))
3128 off = op0;
3129 offtype = TREE_TYPE (off);
3130 STRIP_NOPS (off);
3131 continue;
3133 break;
3134 default:
3135 break;
3137 break;
3140 /* If at the end OFF still isn't a SSA_NAME or isn't
3141 defined in the loop, punt. */
3142 if (TREE_CODE (off) != SSA_NAME
3143 || expr_invariant_in_loop_p (loop, off))
3144 return NULL_TREE;
3146 if (offtype == NULL_TREE)
3147 offtype = TREE_TYPE (off);
3149 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3150 offtype, scale);
3151 if (decl == NULL_TREE)
3152 return NULL_TREE;
3154 if (basep)
3155 *basep = base;
3156 if (offp)
3157 *offp = off;
3158 if (scalep)
3159 *scalep = scale;
3160 return decl;
3163 /* Function vect_analyze_data_refs.
3165 Find all the data references in the loop or basic block.
3167 The general structure of the analysis of data refs in the vectorizer is as
3168 follows:
3169 1- vect_analyze_data_refs(loop/bb): call
3170 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3171 in the loop/bb and their dependences.
3172 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3173 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3174 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3178 bool
3179 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3180 bb_vec_info bb_vinfo,
3181 int *min_vf, unsigned *n_stmts)
3183 struct loop *loop = NULL;
3184 basic_block bb = NULL;
3185 unsigned int i;
3186 vec<data_reference_p> datarefs;
3187 struct data_reference *dr;
3188 tree scalar_type;
3190 if (dump_enabled_p ())
3191 dump_printf_loc (MSG_NOTE, vect_location,
3192 "=== vect_analyze_data_refs ===\n");
3194 if (loop_vinfo)
3196 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3198 loop = LOOP_VINFO_LOOP (loop_vinfo);
3199 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3200 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3202 if (dump_enabled_p ())
3203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3204 "not vectorized: loop contains function calls"
3205 " or data references that cannot be analyzed\n");
3206 return false;
3209 for (i = 0; i < loop->num_nodes; i++)
3211 gimple_stmt_iterator gsi;
3213 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3215 gimple stmt = gsi_stmt (gsi);
3216 if (is_gimple_debug (stmt))
3217 continue;
3218 ++*n_stmts;
3219 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3221 if (is_gimple_call (stmt) && loop->safelen)
3223 tree fndecl = gimple_call_fndecl (stmt), op;
3224 if (fndecl != NULL_TREE)
3226 struct cgraph_node *node = cgraph_get_node (fndecl);
3227 if (node != NULL && node->simd_clones != NULL)
3229 unsigned int j, n = gimple_call_num_args (stmt);
3230 for (j = 0; j < n; j++)
3232 op = gimple_call_arg (stmt, j);
3233 if (DECL_P (op)
3234 || (REFERENCE_CLASS_P (op)
3235 && get_base_address (op)))
3236 break;
3238 op = gimple_call_lhs (stmt);
3239 /* Ignore #pragma omp declare simd functions
3240 if they don't have data references in the
3241 call stmt itself. */
3242 if (j == n
3243 && !(op
3244 && (DECL_P (op)
3245 || (REFERENCE_CLASS_P (op)
3246 && get_base_address (op)))))
3247 continue;
3251 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3252 if (dump_enabled_p ())
3253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3254 "not vectorized: loop contains function "
3255 "calls or data references that cannot "
3256 "be analyzed\n");
3257 return false;
3262 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3264 else
3266 gimple_stmt_iterator gsi;
3268 bb = BB_VINFO_BB (bb_vinfo);
3269 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3271 gimple stmt = gsi_stmt (gsi);
3272 if (is_gimple_debug (stmt))
3273 continue;
3274 ++*n_stmts;
3275 if (!find_data_references_in_stmt (NULL, stmt,
3276 &BB_VINFO_DATAREFS (bb_vinfo)))
3278 /* Mark the rest of the basic-block as unvectorizable. */
3279 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3281 stmt = gsi_stmt (gsi);
3282 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3284 break;
3288 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3291 /* Go through the data-refs, check that the analysis succeeded. Update
3292 pointer from stmt_vec_info struct to DR and vectype. */
3294 FOR_EACH_VEC_ELT (datarefs, i, dr)
3296 gimple stmt;
3297 stmt_vec_info stmt_info;
3298 tree base, offset, init;
3299 bool gather = false;
3300 bool simd_lane_access = false;
3301 int vf;
3303 again:
3304 if (!dr || !DR_REF (dr))
3306 if (dump_enabled_p ())
3307 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3308 "not vectorized: unhandled data-ref\n");
3309 return false;
3312 stmt = DR_STMT (dr);
3313 stmt_info = vinfo_for_stmt (stmt);
3315 /* Discard clobbers from the dataref vector. We will remove
3316 clobber stmts during vectorization. */
3317 if (gimple_clobber_p (stmt))
3319 free_data_ref (dr);
3320 if (i == datarefs.length () - 1)
3322 datarefs.pop ();
3323 break;
3325 datarefs.ordered_remove (i);
3326 dr = datarefs[i];
3327 goto again;
3330 /* Check that analysis of the data-ref succeeded. */
3331 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3332 || !DR_STEP (dr))
3334 bool maybe_gather
3335 = DR_IS_READ (dr)
3336 && !TREE_THIS_VOLATILE (DR_REF (dr))
3337 && targetm.vectorize.builtin_gather != NULL;
3338 bool maybe_simd_lane_access
3339 = loop_vinfo && loop->simduid;
3341 /* If target supports vector gather loads, or if this might be
3342 a SIMD lane access, see whether they can be used. */
3343 if (loop_vinfo
3344 && (maybe_gather || maybe_simd_lane_access)
3345 && !nested_in_vect_loop_p (loop, stmt))
3347 struct data_reference *newdr
3348 = create_data_ref (NULL, loop_containing_stmt (stmt),
3349 DR_REF (dr), stmt, true);
3350 gcc_assert (newdr != NULL && DR_REF (newdr));
3351 if (DR_BASE_ADDRESS (newdr)
3352 && DR_OFFSET (newdr)
3353 && DR_INIT (newdr)
3354 && DR_STEP (newdr)
3355 && integer_zerop (DR_STEP (newdr)))
3357 if (maybe_simd_lane_access)
3359 tree off = DR_OFFSET (newdr);
3360 STRIP_NOPS (off);
3361 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3362 && TREE_CODE (off) == MULT_EXPR
3363 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3365 tree step = TREE_OPERAND (off, 1);
3366 off = TREE_OPERAND (off, 0);
3367 STRIP_NOPS (off);
3368 if (CONVERT_EXPR_P (off)
3369 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3370 0)))
3371 < TYPE_PRECISION (TREE_TYPE (off)))
3372 off = TREE_OPERAND (off, 0);
3373 if (TREE_CODE (off) == SSA_NAME)
3375 gimple def = SSA_NAME_DEF_STMT (off);
3376 tree reft = TREE_TYPE (DR_REF (newdr));
3377 if (is_gimple_call (def)
3378 && gimple_call_internal_p (def)
3379 && (gimple_call_internal_fn (def)
3380 == IFN_GOMP_SIMD_LANE))
3382 tree arg = gimple_call_arg (def, 0);
3383 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3384 arg = SSA_NAME_VAR (arg);
3385 if (arg == loop->simduid
3386 /* For now. */
3387 && tree_int_cst_equal
3388 (TYPE_SIZE_UNIT (reft),
3389 step))
3391 DR_OFFSET (newdr) = ssize_int (0);
3392 DR_STEP (newdr) = step;
3393 DR_ALIGNED_TO (newdr)
3394 = size_int (BIGGEST_ALIGNMENT);
3395 dr = newdr;
3396 simd_lane_access = true;
3402 if (!simd_lane_access && maybe_gather)
3404 dr = newdr;
3405 gather = true;
3408 if (!gather && !simd_lane_access)
3409 free_data_ref (newdr);
3412 if (!gather && !simd_lane_access)
3414 if (dump_enabled_p ())
3416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3417 "not vectorized: data ref analysis "
3418 "failed ");
3419 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3420 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3423 if (bb_vinfo)
3424 break;
3426 return false;
3430 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3432 if (dump_enabled_p ())
3433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3434 "not vectorized: base addr of dr is a "
3435 "constant\n");
3437 if (bb_vinfo)
3438 break;
3440 if (gather || simd_lane_access)
3441 free_data_ref (dr);
3442 return false;
3445 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3447 if (dump_enabled_p ())
3449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3450 "not vectorized: volatile type ");
3451 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3452 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3455 if (bb_vinfo)
3456 break;
3458 return false;
3461 if (stmt_can_throw_internal (stmt))
3463 if (dump_enabled_p ())
3465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3466 "not vectorized: statement can throw an "
3467 "exception ");
3468 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3469 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3472 if (bb_vinfo)
3473 break;
3475 if (gather || simd_lane_access)
3476 free_data_ref (dr);
3477 return false;
3480 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3481 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3483 if (dump_enabled_p ())
3485 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3486 "not vectorized: statement is bitfield "
3487 "access ");
3488 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3489 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3492 if (bb_vinfo)
3493 break;
3495 if (gather || simd_lane_access)
3496 free_data_ref (dr);
3497 return false;
3500 base = unshare_expr (DR_BASE_ADDRESS (dr));
3501 offset = unshare_expr (DR_OFFSET (dr));
3502 init = unshare_expr (DR_INIT (dr));
3504 if (is_gimple_call (stmt)
3505 && (!gimple_call_internal_p (stmt)
3506 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3507 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3509 if (dump_enabled_p ())
3511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3512 "not vectorized: dr in a call ");
3513 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3514 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3517 if (bb_vinfo)
3518 break;
3520 if (gather || simd_lane_access)
3521 free_data_ref (dr);
3522 return false;
3525 /* Update DR field in stmt_vec_info struct. */
3527 /* If the dataref is in an inner-loop of the loop that is considered for
3528 vectorization, we also want to analyze the access relative to
3529 the outer-loop (DR contains information only relative to the
3530 inner-most enclosing loop). We do that by building a reference to the
3531 first location accessed by the inner-loop, and analyze it relative to
3532 the outer-loop. */
3533 if (loop && nested_in_vect_loop_p (loop, stmt))
3535 tree outer_step, outer_base, outer_init;
3536 HOST_WIDE_INT pbitsize, pbitpos;
3537 tree poffset;
3538 enum machine_mode pmode;
3539 int punsignedp, pvolatilep;
3540 affine_iv base_iv, offset_iv;
3541 tree dinit;
3543 /* Build a reference to the first location accessed by the
3544 inner-loop: *(BASE+INIT). (The first location is actually
3545 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3546 tree inner_base = build_fold_indirect_ref
3547 (fold_build_pointer_plus (base, init));
3549 if (dump_enabled_p ())
3551 dump_printf_loc (MSG_NOTE, vect_location,
3552 "analyze in outer-loop: ");
3553 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3554 dump_printf (MSG_NOTE, "\n");
3557 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3558 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3559 gcc_assert (outer_base != NULL_TREE);
3561 if (pbitpos % BITS_PER_UNIT != 0)
3563 if (dump_enabled_p ())
3564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3565 "failed: bit offset alignment.\n");
3566 return false;
3569 outer_base = build_fold_addr_expr (outer_base);
3570 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3571 &base_iv, false))
3573 if (dump_enabled_p ())
3574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3575 "failed: evolution of base is not affine.\n");
3576 return false;
3579 if (offset)
3581 if (poffset)
3582 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3583 poffset);
3584 else
3585 poffset = offset;
3588 if (!poffset)
3590 offset_iv.base = ssize_int (0);
3591 offset_iv.step = ssize_int (0);
3593 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3594 &offset_iv, false))
3596 if (dump_enabled_p ())
3597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3598 "evolution of offset is not affine.\n");
3599 return false;
3602 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3603 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3604 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3605 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3606 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3608 outer_step = size_binop (PLUS_EXPR,
3609 fold_convert (ssizetype, base_iv.step),
3610 fold_convert (ssizetype, offset_iv.step));
3612 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3613 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3614 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3615 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3616 STMT_VINFO_DR_OFFSET (stmt_info) =
3617 fold_convert (ssizetype, offset_iv.base);
3618 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3619 size_int (highest_pow2_factor (offset_iv.base));
3621 if (dump_enabled_p ())
3623 dump_printf_loc (MSG_NOTE, vect_location,
3624 "\touter base_address: ");
3625 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3626 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3627 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3628 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3629 STMT_VINFO_DR_OFFSET (stmt_info));
3630 dump_printf (MSG_NOTE,
3631 "\n\touter constant offset from base address: ");
3632 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3633 STMT_VINFO_DR_INIT (stmt_info));
3634 dump_printf (MSG_NOTE, "\n\touter step: ");
3635 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3636 STMT_VINFO_DR_STEP (stmt_info));
3637 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3638 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3639 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3640 dump_printf (MSG_NOTE, "\n");
3644 if (STMT_VINFO_DATA_REF (stmt_info))
3646 if (dump_enabled_p ())
3648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3649 "not vectorized: more than one data ref "
3650 "in stmt: ");
3651 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3652 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3655 if (bb_vinfo)
3656 break;
3658 if (gather || simd_lane_access)
3659 free_data_ref (dr);
3660 return false;
3663 STMT_VINFO_DATA_REF (stmt_info) = dr;
3664 if (simd_lane_access)
3666 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3667 free_data_ref (datarefs[i]);
3668 datarefs[i] = dr;
3671 /* Set vectype for STMT. */
3672 scalar_type = TREE_TYPE (DR_REF (dr));
3673 STMT_VINFO_VECTYPE (stmt_info)
3674 = get_vectype_for_scalar_type (scalar_type);
3675 if (!STMT_VINFO_VECTYPE (stmt_info))
3677 if (dump_enabled_p ())
3679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3680 "not vectorized: no vectype for stmt: ");
3681 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3682 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3683 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3684 scalar_type);
3685 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3688 if (bb_vinfo)
3689 break;
3691 if (gather || simd_lane_access)
3693 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3694 if (gather)
3695 free_data_ref (dr);
3697 return false;
3699 else
3701 if (dump_enabled_p ())
3703 dump_printf_loc (MSG_NOTE, vect_location,
3704 "got vectype for stmt: ");
3705 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3706 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3707 STMT_VINFO_VECTYPE (stmt_info));
3708 dump_printf (MSG_NOTE, "\n");
3712 /* Adjust the minimal vectorization factor according to the
3713 vector type. */
3714 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3715 if (vf > *min_vf)
3716 *min_vf = vf;
3718 if (gather)
3720 tree off;
3722 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3723 if (gather
3724 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3725 gather = false;
3726 if (!gather)
3728 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3729 free_data_ref (dr);
3730 if (dump_enabled_p ())
3732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3733 "not vectorized: not suitable for gather "
3734 "load ");
3735 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3736 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3738 return false;
3741 datarefs[i] = dr;
3742 STMT_VINFO_GATHER_P (stmt_info) = true;
3744 else if (loop_vinfo
3745 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3747 if (nested_in_vect_loop_p (loop, stmt)
3748 || !DR_IS_READ (dr))
3750 if (dump_enabled_p ())
3752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3753 "not vectorized: not suitable for strided "
3754 "load ");
3755 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3756 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3758 return false;
3760 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3764 /* If we stopped the analysis at the first dataref we could not analyze
3765 when trying to vectorize a basic-block, mark the rest of the datarefs
3766 as not vectorizable and truncate the vector of datarefs. That
3767 avoids spending useless time analyzing their dependences. */
3768 if (i != datarefs.length ())
3770 gcc_assert (bb_vinfo != NULL);
3771 for (unsigned j = i; j < datarefs.length (); ++j)
3773 data_reference_p dr = datarefs[j];
3774 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3775 free_data_ref (dr);
3777 datarefs.truncate (i);
3780 return true;
3784 /* Function vect_get_new_vect_var.
3786 Returns a new temporary variable of type TYPE. The current naming scheme
3787 prepends the prefix "vect_", "stmp_" or "vectp_" (depending on the value
3788 of VAR_KIND) to NAME, if NAME is provided; otherwise the bare prefix is
3789 used. */
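/* For example (illustrative only): with VAR_KIND == vect_pointer_var and
   NAME == "a", the new variable is named "vectp_a"; with no NAME it is just
   "vectp".  The other kinds use the prefixes "vect" and "stmp".  */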
3791 tree
3792 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3794 const char *prefix;
3795 tree new_vect_var;
3797 switch (var_kind)
3799 case vect_simple_var:
3800 prefix = "vect";
3801 break;
3802 case vect_scalar_var:
3803 prefix = "stmp";
3804 break;
3805 case vect_pointer_var:
3806 prefix = "vectp";
3807 break;
3808 default:
3809 gcc_unreachable ();
3812 if (name)
3814 char* tmp = concat (prefix, "_", name, NULL);
3815 new_vect_var = create_tmp_reg (type, tmp);
3816 free (tmp);
3818 else
3819 new_vect_var = create_tmp_reg (type, prefix);
3821 return new_vect_var;
3825 /* Function vect_create_addr_base_for_vector_ref.
3827 Create an expression that computes the address of the first memory location
3828 that will be accessed for a data reference.
3830 Input:
3831 STMT: The statement containing the data reference.
3832 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3833 OFFSET: Optional. If supplied, it is added to the initial address.
3834 LOOP: Specifies relative to which loop-nest the address should be computed.
3835 For example, when the dataref is in an inner-loop nested in an
3836 outer-loop that is now being vectorized, LOOP can be either the
3837 outer-loop, or the inner-loop. The first memory location accessed
3838 by the following dataref ('in' points to short):
3840 for (i=0; i<N; i++)
3841 for (j=0; j<M; j++)
3842 s += in[i+j]
3844 is as follows:
3845 if LOOP=i_loop: &in (relative to i_loop)
3846 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3847 BYTE_OFFSET: Optional, defaults to NULL. If supplied, it is added to the
3848 initial address. Unlike OFFSET, which is the number of elements to
3849 be added, BYTE_OFFSET is measured in bytes.
3851 Output:
3852 1. Return an SSA_NAME whose value is the address of the memory location of
3853 the first vector of the data reference.
3854 2. If NEW_STMT_LIST is not NULL_TREE after return, the caller must insert
3855 the statement(s) that define the returned SSA_NAME.
3857 FORNOW: We are only handling array accesses with step 1. */
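/* As a sketch (the names here are illustrative, not taken from any actual
   dump), for a dataref a[i] with vector type V4SI the function typically
   emits into NEW_STMT_LIST something equivalent to

     vectp_a.0 = (vector(4) int *) &a;

   and returns the SSA_NAME vectp_a.0, which holds the address of the first
   vector to be accessed.  */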
3859 tree
3860 vect_create_addr_base_for_vector_ref (gimple stmt,
3861 gimple_seq *new_stmt_list,
3862 tree offset,
3863 struct loop *loop,
3864 tree byte_offset)
3866 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3867 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3868 tree data_ref_base;
3869 const char *base_name;
3870 tree addr_base;
3871 tree dest;
3872 gimple_seq seq = NULL;
3873 tree base_offset;
3874 tree init;
3875 tree vect_ptr_type;
3876 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3877 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3879 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3881 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3883 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3885 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3886 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3887 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3889 else
3891 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3892 base_offset = unshare_expr (DR_OFFSET (dr));
3893 init = unshare_expr (DR_INIT (dr));
3896 if (loop_vinfo)
3897 base_name = get_name (data_ref_base);
3898 else
3900 base_offset = ssize_int (0);
3901 init = ssize_int (0);
3902 base_name = get_name (DR_REF (dr));
3905 /* Create base_offset */
3906 base_offset = size_binop (PLUS_EXPR,
3907 fold_convert (sizetype, base_offset),
3908 fold_convert (sizetype, init));
3910 if (offset)
3912 offset = fold_build2 (MULT_EXPR, sizetype,
3913 fold_convert (sizetype, offset), step);
3914 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3915 base_offset, offset);
3917 if (byte_offset)
3919 byte_offset = fold_convert (sizetype, byte_offset);
3920 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3921 base_offset, byte_offset);
3924 /* base + base_offset */
3925 if (loop_vinfo)
3926 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3927 else
3929 addr_base = build1 (ADDR_EXPR,
3930 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3931 unshare_expr (DR_REF (dr)));
3934 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3935 addr_base = fold_convert (vect_ptr_type, addr_base);
3936 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3937 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3938 gimple_seq_add_seq (new_stmt_list, seq);
3940 if (DR_PTR_INFO (dr)
3941 && TREE_CODE (addr_base) == SSA_NAME)
3943 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3944 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3945 int misalign = DR_MISALIGNMENT (dr);
3946 if (offset || byte_offset || (misalign == -1))
3947 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3948 else
3949 set_ptr_info_alignment (SSA_NAME_PTR_INFO (addr_base), align, misalign);
3952 if (dump_enabled_p ())
3954 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3955 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3956 dump_printf (MSG_NOTE, "\n");
3959 return addr_base;
3963 /* Function vect_create_data_ref_ptr.
3965 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3966 location accessed in the loop by STMT, along with the def-use update
3967 chain to appropriately advance the pointer through the loop iterations.
3968 Also set aliasing information for the pointer. This pointer is used by
3969 the callers to this function to create a memory reference expression for
3970 vector load/store access.
3972 Input:
3973 1. STMT: a stmt that references memory. Expected to be of the form
3974 GIMPLE_ASSIGN <name, data-ref> or
3975 GIMPLE_ASSIGN <data-ref, name>.
3976 2. AGGR_TYPE: the type of the reference, which should be either a vector
3977 or an array.
3978 3. AT_LOOP: the loop where the vector memref is to be created.
3979 4. OFFSET (optional): an offset to be added to the initial address accessed
3980 by the data-ref in STMT.
3981 5. BSI: location where the new stmts are to be placed if there is no loop.
3982 6. ONLY_INIT: indicates whether ap is to be updated in the loop, or remain
3983 pointing to the initial address.
3984 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
3985 to the initial address accessed by the data-ref in STMT. This is
3986 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
3987 in bytes.
3989 Output:
3990 1. Declare a new ptr to vector_type, and have it point to the base of the
3991 data reference (the initial address accessed by the data reference).
3992 For example, for vector of type V8HI, the following code is generated:
3994 v8hi *ap;
3995 ap = (v8hi *)initial_address;
3997 if OFFSET is not supplied:
3998 initial_address = &a[init];
3999 if OFFSET is supplied:
4000 initial_address = &a[init + OFFSET];
4001 if BYTE_OFFSET is supplied:
4002 initial_address = &a[init] + BYTE_OFFSET;
4004 Return the initial_address in INITIAL_ADDRESS.
4006 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4007 update the pointer in each iteration of the loop.
4009 Return the increment stmt that updates the pointer in PTR_INCR.
4011 3. Set INV_P to true if the access pattern of the data reference in the
4012 vectorized loop is invariant. Set it to false otherwise.
4014 4. Return the pointer. */
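/* A typical call (sketch only; the actual callers live in the statement
   vectorization code) looks roughly like

     tree dummy;
     gimple ptr_incr;
     bool inv_p;
     tree dataref_ptr
       = vect_create_data_ref_ptr (stmt, vectype, NULL, NULL_TREE, &dummy,
                                   gsi, &ptr_incr, false, &inv_p, NULL_TREE);

   after which DATAREF_PTR is dereferenced to build the vector load/store,
   and bump_vector_ptr can be used to advance it between vector copies.  */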
4016 tree
4017 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4018 tree offset, tree *initial_address,
4019 gimple_stmt_iterator *gsi, gimple *ptr_incr,
4020 bool only_init, bool *inv_p, tree byte_offset)
4022 const char *base_name;
4023 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4024 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4025 struct loop *loop = NULL;
4026 bool nested_in_vect_loop = false;
4027 struct loop *containing_loop = NULL;
4028 tree aggr_ptr_type;
4029 tree aggr_ptr;
4030 tree new_temp;
4031 gimple vec_stmt;
4032 gimple_seq new_stmt_list = NULL;
4033 edge pe = NULL;
4034 basic_block new_bb;
4035 tree aggr_ptr_init;
4036 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4037 tree aptr;
4038 gimple_stmt_iterator incr_gsi;
4039 bool insert_after;
4040 tree indx_before_incr, indx_after_incr;
4041 gimple incr;
4042 tree step;
4043 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4045 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4046 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4048 if (loop_vinfo)
4050 loop = LOOP_VINFO_LOOP (loop_vinfo);
4051 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4052 containing_loop = (gimple_bb (stmt))->loop_father;
4053 pe = loop_preheader_edge (loop);
4055 else
4057 gcc_assert (bb_vinfo);
4058 only_init = true;
4059 *ptr_incr = NULL;
4062 /* Check the step (evolution) of the load in LOOP, and record
4063 whether it's invariant. */
4064 if (nested_in_vect_loop)
4065 step = STMT_VINFO_DR_STEP (stmt_info);
4066 else
4067 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4069 if (integer_zerop (step))
4070 *inv_p = true;
4071 else
4072 *inv_p = false;
4074 /* Create an expression for the first address accessed by this load
4075 in LOOP. */
4076 base_name = get_name (DR_BASE_ADDRESS (dr));
4078 if (dump_enabled_p ())
4080 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4081 dump_printf_loc (MSG_NOTE, vect_location,
4082 "create %s-pointer variable to type: ",
4083 get_tree_code_name (TREE_CODE (aggr_type)));
4084 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4085 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4086 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4087 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4088 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4089 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4090 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4091 else
4092 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4093 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4094 dump_printf (MSG_NOTE, "\n");
4097 /* (1) Create the new aggregate-pointer variable.
4098 Vector and array types inherit the alias set of their component
4099 type by default so we need to use a ref-all pointer if the data
4100 reference does not conflict with the created aggregated data
4101 reference because it is not addressable. */
4102 bool need_ref_all = false;
4103 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4104 get_alias_set (DR_REF (dr))))
4105 need_ref_all = true;
4106 /* Likewise for any of the data references in the stmt group. */
4107 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4109 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4112 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4113 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4114 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4115 get_alias_set (DR_REF (sdr))))
4117 need_ref_all = true;
4118 break;
4120 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4122 while (orig_stmt);
4124 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4125 need_ref_all);
4126 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4129 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4130 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4131 def-use update cycles for the pointer: one relative to the outer-loop
4132 (LOOP), which is what steps (3) and (4) below do. The other is relative
4133 to the inner-loop (which is the inner-most loop containing the dataref),
4134 and this is done by step (5) below.
4136 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4137 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4138 redundant. Steps (3),(4) create the following:
4140 vp0 = &base_addr;
4141 LOOP: vp1 = phi(vp0,vp2)
4144 vp2 = vp1 + step
4145 goto LOOP
4147 If there is an inner-loop nested in loop, then step (5) will also be
4148 applied, and an additional update in the inner-loop will be created:
4150 vp0 = &base_addr;
4151 LOOP: vp1 = phi(vp0,vp2)
4153 inner: vp3 = phi(vp1,vp4)
4154 vp4 = vp3 + inner_step
4155 if () goto inner
4157 vp2 = vp1 + step
4158 if () goto LOOP */
4160 /* (2) Calculate the initial address of the aggregate-pointer, and set
4161 the aggregate-pointer to point to it before the loop. */
4163 /* Create: (&(base[init_val+offset]) + byte_offset) in the loop preheader. */
4165 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4166 offset, loop, byte_offset);
4167 if (new_stmt_list)
4169 if (pe)
4171 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4172 gcc_assert (!new_bb);
4174 else
4175 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4178 *initial_address = new_temp;
4180 /* Create: p = (aggr_type *) initial_base */
4181 if (TREE_CODE (new_temp) != SSA_NAME
4182 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4184 vec_stmt = gimple_build_assign (aggr_ptr,
4185 fold_convert (aggr_ptr_type, new_temp));
4186 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4187 /* Copy the points-to information if it exists. */
4188 if (DR_PTR_INFO (dr))
4189 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4190 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4191 if (pe)
4193 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4194 gcc_assert (!new_bb);
4196 else
4197 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4199 else
4200 aggr_ptr_init = new_temp;
4202 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4203 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4204 inner-loop nested in LOOP (during outer-loop vectorization). */
4206 /* No update in loop is required. */
4207 if (only_init && (!loop_vinfo || at_loop == loop))
4208 aptr = aggr_ptr_init;
4209 else
4211 /* The step of the aggregate pointer is the type size. */
4212 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4213 /* One exception to the above is when the scalar step of the load in
4214 LOOP is zero. In this case the step here is also zero. */
4215 if (*inv_p)
4216 iv_step = size_zero_node;
4217 else if (tree_int_cst_sgn (step) == -1)
4218 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4220 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4222 create_iv (aggr_ptr_init,
4223 fold_convert (aggr_ptr_type, iv_step),
4224 aggr_ptr, loop, &incr_gsi, insert_after,
4225 &indx_before_incr, &indx_after_incr);
4226 incr = gsi_stmt (incr_gsi);
4227 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4229 /* Copy the points-to information if it exists. */
4230 if (DR_PTR_INFO (dr))
4232 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4233 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4235 if (ptr_incr)
4236 *ptr_incr = incr;
4238 aptr = indx_before_incr;
4241 if (!nested_in_vect_loop || only_init)
4242 return aptr;
4245 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4246 nested in LOOP, if it exists. */
4248 gcc_assert (nested_in_vect_loop);
4249 if (!only_init)
4251 standard_iv_increment_position (containing_loop, &incr_gsi,
4252 &insert_after);
4253 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4254 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4255 &indx_after_incr);
4256 incr = gsi_stmt (incr_gsi);
4257 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4259 /* Copy the points-to information if it exists. */
4260 if (DR_PTR_INFO (dr))
4262 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4263 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4265 if (ptr_incr)
4266 *ptr_incr = incr;
4268 return indx_before_incr;
4270 else
4271 gcc_unreachable ();
4275 /* Function bump_vector_ptr.
4277 Increment a pointer (to a vector type) by vector-size. If requested,
4278 i.e. if PTR_INCR is given, then also connect the new increment stmt
4279 to the existing def-use update-chain of the pointer, by modifying
4280 the PTR_INCR as illustrated below:
4282 The pointer def-use update-chain before this function:
4283 DATAREF_PTR = phi (p_0, p_2)
4284 ....
4285 PTR_INCR: p_2 = DATAREF_PTR + step
4287 The pointer def-use update-chain after this function:
4288 DATAREF_PTR = phi (p_0, p_2)
4289 ....
4290 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4291 ....
4292 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4294 Input:
4295 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4296 in the loop.
4297 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4298 the loop. The increment amount across iterations is expected
4299 to be vector_size.
4300 BSI - location where the new update stmt is to be placed.
4301 STMT - the original scalar memory-access stmt that is being vectorized.
4302 BUMP - optional. The offset by which to bump the pointer. If not given,
4303 the offset is assumed to be vector_size.
4305 Output: Return NEW_DATAREF_PTR as illustrated above.
4309 tree
4310 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4311 gimple stmt, tree bump)
4313 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4314 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4315 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4316 tree update = TYPE_SIZE_UNIT (vectype);
4317 gimple incr_stmt;
4318 ssa_op_iter iter;
4319 use_operand_p use_p;
4320 tree new_dataref_ptr;
4322 if (bump)
4323 update = bump;
4325 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4326 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4327 dataref_ptr, update);
4328 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4330 /* Copy the points-to information if it exists. */
4331 if (DR_PTR_INFO (dr))
4333 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4334 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4337 if (!ptr_incr)
4338 return new_dataref_ptr;
4340 /* Update the vector-pointer's cross-iteration increment. */
4341 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4343 tree use = USE_FROM_PTR (use_p);
4345 if (use == dataref_ptr)
4346 SET_USE (use_p, new_dataref_ptr);
4347 else
4348 gcc_assert (tree_int_cst_compare (use, update) == 0);
4351 return new_dataref_ptr;
4355 /* Function vect_create_destination_var.
4357 Create a new temporary of type VECTYPE. */
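/* For instance (illustrative), for a scalar destination x_7 whose base name
   is "x" and a non-NULL VECTYPE, the new temporary is named "vect_x_7";
   if VECTYPE is NULL the scalar kind is used and the name is "stmp_x_7".  */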
4359 tree
4360 vect_create_destination_var (tree scalar_dest, tree vectype)
4362 tree vec_dest;
4363 const char *name;
4364 char *new_name;
4365 tree type;
4366 enum vect_var_kind kind;
4368 kind = vectype ? vect_simple_var : vect_scalar_var;
4369 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4371 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4373 name = get_name (scalar_dest);
4374 if (name)
4375 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4376 else
4377 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4378 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4379 free (new_name);
4381 return vec_dest;
4384 /* Function vect_grouped_store_supported.
4386 Returns TRUE if interleave high and interleave low permutations
4387 are supported, and FALSE otherwise. */
4389 bool
4390 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4392 enum machine_mode mode = TYPE_MODE (vectype);
4394 /* vect_permute_store_chain requires the group size to be a power of two. */
4395 if (exact_log2 (count) == -1)
4397 if (dump_enabled_p ())
4398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4399 "the size of the group of accesses"
4400 " is not a power of 2\n");
4401 return false;
4404 /* Check that the permutation is supported. */
4405 if (VECTOR_MODE_P (mode))
4407 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4408 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4409 for (i = 0; i < nelt / 2; i++)
4411 sel[i * 2] = i;
4412 sel[i * 2 + 1] = i + nelt;
4414 if (can_vec_perm_p (mode, false, sel))
4416 for (i = 0; i < nelt; i++)
4417 sel[i] += nelt / 2;
4418 if (can_vec_perm_p (mode, false, sel))
4419 return true;
4423 if (dump_enabled_p ())
4424 dump_printf (MSG_MISSED_OPTIMIZATION,
4425 "interleave op not supported by target.\n");
4426 return false;
4430 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4431 type VECTYPE. */
4433 bool
4434 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4436 return vect_lanes_optab_supported_p ("vec_store_lanes",
4437 vec_store_lanes_optab,
4438 vectype, count);
4442 /* Function vect_permute_store_chain.
4444 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4445 a power of 2, generate interleave_high/low stmts to reorder the data
4446 correctly for the stores. Return the final references for stores in
4447 RESULT_CHAIN.
4449 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4450 The input is 4 vectors each containing 8 elements. We assign a number to
4451 each element, the input sequence is:
4453 1st vec: 0 1 2 3 4 5 6 7
4454 2nd vec: 8 9 10 11 12 13 14 15
4455 3rd vec: 16 17 18 19 20 21 22 23
4456 4th vec: 24 25 26 27 28 29 30 31
4458 The output sequence should be:
4460 1st vec: 0 8 16 24 1 9 17 25
4461 2nd vec: 2 10 18 26 3 11 19 27
4462 3rd vec: 4 12 20 28 5 13 21 29
4463 4th vec: 6 14 22 30 7 15 23 31
4465 i.e., we interleave the contents of the four vectors in their order.
4467 We use interleave_high/low instructions to create such output. The input of
4468 each interleave_high/low operation is two vectors:
4469 1st vec 2nd vec
4470 0 1 2 3 4 5 6 7
4471 the even elements of the result vector are obtained left-to-right from the
4472 high/low elements of the first vector. The odd elements of the result are
4473 obtained left-to-right from the high/low elements of the second vector.
4474 The output of interleave_high will be: 0 4 1 5
4475 and of interleave_low: 2 6 3 7
4478 The permutation is done in log LENGTH stages. In each stage interleave_high
4479 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4480 where the first argument is taken from the first half of DR_CHAIN and the
4481 second argument from its second half.
4482 In our example,
4484 I1: interleave_high (1st vec, 3rd vec)
4485 I2: interleave_low (1st vec, 3rd vec)
4486 I3: interleave_high (2nd vec, 4th vec)
4487 I4: interleave_low (2nd vec, 4th vec)
4489 The output for the first stage is:
4491 I1: 0 16 1 17 2 18 3 19
4492 I2: 4 20 5 21 6 22 7 23
4493 I3: 8 24 9 25 10 26 11 27
4494 I4: 12 28 13 29 14 30 15 31
4496 The output of the second stage, i.e. the final result is:
4498 I1: 0 8 16 24 1 9 17 25
4499 I2: 2 10 18 26 3 11 19 27
4500 I3: 4 12 20 28 5 13 21 29
4501 I4: 6 14 22 30 7 15 23 31. */
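/* Concretely (assuming V8HI vectors, i.e. nelt == 8), the two masks built
   below select

     perm_mask_high = { 0, 8, 1, 9, 2, 10, 3, 11 }
     perm_mask_low  = { 4, 12, 5, 13, 6, 14, 7, 15 }

   so each VEC_PERM_EXPR interleaves one half of its two input vectors.  */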
4503 void
4504 vect_permute_store_chain (vec<tree> dr_chain,
4505 unsigned int length,
4506 gimple stmt,
4507 gimple_stmt_iterator *gsi,
4508 vec<tree> *result_chain)
4510 tree vect1, vect2, high, low;
4511 gimple perm_stmt;
4512 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4513 tree perm_mask_low, perm_mask_high;
4514 unsigned int i, n;
4515 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4516 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4518 result_chain->quick_grow (length);
4519 memcpy (result_chain->address (), dr_chain.address (),
4520 length * sizeof (tree));
4522 for (i = 0, n = nelt / 2; i < n; i++)
4524 sel[i * 2] = i;
4525 sel[i * 2 + 1] = i + nelt;
4527 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4528 gcc_assert (perm_mask_high != NULL);
4530 for (i = 0; i < nelt; i++)
4531 sel[i] += nelt / 2;
4532 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4533 gcc_assert (perm_mask_low != NULL);
4535 for (i = 0, n = exact_log2 (length); i < n; i++)
4537 for (j = 0; j < length/2; j++)
4539 vect1 = dr_chain[j];
4540 vect2 = dr_chain[j+length/2];
4542 /* Create interleaving stmt:
4543 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4544 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4545 perm_stmt
4546 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4547 vect1, vect2, perm_mask_high);
4548 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4549 (*result_chain)[2*j] = high;
4551 /* Create interleaving stmt:
4552 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4553 nelt*3/2+1, ...}> */
4554 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4555 perm_stmt
4556 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4557 vect1, vect2, perm_mask_low);
4558 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4559 (*result_chain)[2*j+1] = low;
4561 memcpy (dr_chain.address (), result_chain->address (),
4562 length * sizeof (tree));
4566 /* Function vect_setup_realignment.
4568 This function is called when vectorizing an unaligned load using
4569 the dr_explicit_realign[_optimized] scheme.
4570 This function generates the following code at the loop prolog:
4572 p = initial_addr;
4573 x msq_init = *(floor(p)); # prolog load
4574 realignment_token = call target_builtin;
4575 loop:
4576 x msq = phi (msq_init, ---)
4578 The stmts marked with x are generated only for the case of
4579 dr_explicit_realign_optimized.
4581 The code above sets up a new (vector) pointer, pointing to the first
4582 location accessed by STMT, and a "floor-aligned" load using that pointer.
4583 It also generates code to compute the "realignment-token" (if the relevant
4584 target hook was defined), and creates a phi-node at the loop-header bb
4585 whose arguments are the result of the prolog-load (created by this
4586 function) and the result of a load that takes place in the loop (to be
4587 created by the caller to this function).
4589 For the case of dr_explicit_realign_optimized:
4590 The caller to this function uses the phi-result (msq) to create the
4591 realignment code inside the loop, and sets up the missing phi argument,
4592 as follows:
4593 loop:
4594 msq = phi (msq_init, lsq)
4595 lsq = *(floor(p')); # load in loop
4596 result = realign_load (msq, lsq, realignment_token);
4598 For the case of dr_explicit_realign:
4599 loop:
4600 msq = *(floor(p)); # load in loop
4601 p' = p + (VS-1);
4602 lsq = *(floor(p')); # load in loop
4603 result = realign_load (msq, lsq, realignment_token);
4605 Input:
4606 STMT - (scalar) load stmt to be vectorized. This load accesses
4607 a memory location that may be unaligned.
4608 BSI - place where new code is to be inserted.
4609 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4610 is used.
4612 Output:
4613 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4614 target hook, if defined.
4615 Return value - the result of the loop-header phi node. */
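/* As a concrete illustration (assuming a 16-byte vector type such as V16QI),
   "floor(p)" above means rounding the address down to the vector alignment;
   the code below emits this in the preheader roughly as

     p_floor = p & -16;        # i.e. p & -TYPE_ALIGN_UNIT (vectype)
     msq_init = *p_floor;      # prolog load

   for the dr_explicit_realign_optimized scheme.  */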
4617 tree
4618 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4619 tree *realignment_token,
4620 enum dr_alignment_support alignment_support_scheme,
4621 tree init_addr,
4622 struct loop **at_loop)
4624 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4625 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4626 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4627 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4628 struct loop *loop = NULL;
4629 edge pe = NULL;
4630 tree scalar_dest = gimple_assign_lhs (stmt);
4631 tree vec_dest;
4632 gimple inc;
4633 tree ptr;
4634 tree data_ref;
4635 gimple new_stmt;
4636 basic_block new_bb;
4637 tree msq_init = NULL_TREE;
4638 tree new_temp;
4639 gimple phi_stmt;
4640 tree msq = NULL_TREE;
4641 gimple_seq stmts = NULL;
4642 bool inv_p;
4643 bool compute_in_loop = false;
4644 bool nested_in_vect_loop = false;
4645 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4646 struct loop *loop_for_initial_load = NULL;
4648 if (loop_vinfo)
4650 loop = LOOP_VINFO_LOOP (loop_vinfo);
4651 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4654 gcc_assert (alignment_support_scheme == dr_explicit_realign
4655 || alignment_support_scheme == dr_explicit_realign_optimized);
4657 /* We need to generate three things:
4658 1. the misalignment computation
4659 2. the extra vector load (for the optimized realignment scheme).
4660 3. the phi node for the two vectors from which the realignment is
4661 done (for the optimized realignment scheme). */
4663 /* 1. Determine where to generate the misalignment computation.
4665 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4666 calculation will be generated by this function, outside the loop (in the
4667 preheader). Otherwise, INIT_ADDR has already been computed for us by the
4668 caller, inside the loop.
4670 Background: If the misalignment remains fixed throughout the iterations of
4671 the loop, then both realignment schemes are applicable, and also the
4672 misalignment computation can be done outside LOOP. This is because we are
4673 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4674 are a multiple of VS (the Vector Size), and therefore the misalignment in
4675 different vectorized LOOP iterations is always the same.
4676 The problem arises only if the memory access is in an inner-loop nested
4677 inside LOOP, which is now being vectorized using outer-loop vectorization.
4678 This is the only case when the misalignment of the memory access may not
4679 remain fixed throughout the iterations of the inner-loop (as explained in
4680 detail in vect_supportable_dr_alignment). In this case, not only is the
4681 optimized realignment scheme not applicable, but also the misalignment
4682 computation (and the generation of the realignment token that is passed to
4683 REALIGN_LOAD) has to be done inside the loop.
4685 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4686 or not, which in turn determines if the misalignment is computed inside
4687 the inner-loop, or outside LOOP. */
4689 if (init_addr != NULL_TREE || !loop_vinfo)
4691 compute_in_loop = true;
4692 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4696 /* 2. Determine where to generate the extra vector load.
4698 For the optimized realignment scheme, instead of generating two vector
4699 loads in each iteration, we generate a single extra vector load in the
4700 preheader of the loop, and in each iteration reuse the result of the
4701 vector load from the previous iteration. In case the memory access is in
4702 an inner-loop nested inside LOOP, which is now being vectorized using
4703 outer-loop vectorization, we need to determine whether this initial vector
4704 load should be generated at the preheader of the inner-loop, or can be
4705 generated at the preheader of LOOP. If the memory access has no evolution
4706 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4707 to be generated inside LOOP (in the preheader of the inner-loop). */
4709 if (nested_in_vect_loop)
4711 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4712 bool invariant_in_outerloop =
4713 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4714 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4716 else
4717 loop_for_initial_load = loop;
4718 if (at_loop)
4719 *at_loop = loop_for_initial_load;
4721 if (loop_for_initial_load)
4722 pe = loop_preheader_edge (loop_for_initial_load);
4724 /* 3. For the case of the optimized realignment, create the first vector
4725 load at the loop preheader. */
4727 if (alignment_support_scheme == dr_explicit_realign_optimized)
4729 /* Create msq_init = *(floor(p1)) in the loop preheader */
4731 gcc_assert (!compute_in_loop);
4732 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4733 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4734 NULL_TREE, &init_addr, NULL, &inc,
4735 true, &inv_p);
4736 new_temp = copy_ssa_name (ptr, NULL);
4737 new_stmt = gimple_build_assign_with_ops
4738 (BIT_AND_EXPR, new_temp, ptr,
4739 build_int_cst (TREE_TYPE (ptr),
4740 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4741 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4742 gcc_assert (!new_bb);
4743 data_ref
4744 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4745 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4746 new_stmt = gimple_build_assign (vec_dest, data_ref);
4747 new_temp = make_ssa_name (vec_dest, new_stmt);
4748 gimple_assign_set_lhs (new_stmt, new_temp);
4749 if (pe)
4751 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4752 gcc_assert (!new_bb);
4754 else
4755 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4757 msq_init = gimple_assign_lhs (new_stmt);
4760 /* 4. Create realignment token using a target builtin, if available.
4761 It is done either inside the containing loop, or before LOOP (as
4762 determined above). */
4764 if (targetm.vectorize.builtin_mask_for_load)
4766 tree builtin_decl;
4768 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4769 if (!init_addr)
4771 /* Generate the INIT_ADDR computation outside LOOP. */
4772 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4773 NULL_TREE, loop);
4774 if (loop)
4776 pe = loop_preheader_edge (loop);
4777 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4778 gcc_assert (!new_bb);
4780 else
4781 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4784 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4785 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4786 vec_dest =
4787 vect_create_destination_var (scalar_dest,
4788 gimple_call_return_type (new_stmt));
4789 new_temp = make_ssa_name (vec_dest, new_stmt);
4790 gimple_call_set_lhs (new_stmt, new_temp);
4792 if (compute_in_loop)
4793 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4794 else
4796 /* Generate the misalignment computation outside LOOP. */
4797 pe = loop_preheader_edge (loop);
4798 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4799 gcc_assert (!new_bb);
4802 *realignment_token = gimple_call_lhs (new_stmt);
4804 /* The result of the CALL_EXPR to this builtin is determined from
4805 the value of the parameter and no global variables are touched
4806 which makes the builtin a "const" function. Requiring the
4807 builtin to have the "const" attribute makes it unnecessary
4808 to call mark_call_clobbered. */
4809 gcc_assert (TREE_READONLY (builtin_decl));
4812 if (alignment_support_scheme == dr_explicit_realign)
4813 return msq;
4815 gcc_assert (!compute_in_loop);
4816 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4819 /* 5. Create msq = phi <msq_init, lsq> in loop */
4821 pe = loop_preheader_edge (containing_loop);
4822 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4823 msq = make_ssa_name (vec_dest, NULL);
4824 phi_stmt = create_phi_node (msq, containing_loop->header);
4825 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4827 return msq;
4831 /* Function vect_grouped_load_supported.
4833 Returns TRUE if even and odd permutations are supported,
4834 and FALSE otherwise. */
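/* Concretely (assuming V4SI, i.e. four elements per vector), the two
   permutations checked below are { 0, 2, 4, 6 } (extract even) and
   { 1, 3, 5, 7 } (extract odd), applied to the concatenation of two
   input vectors.  */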
4836 bool
4837 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4839 enum machine_mode mode = TYPE_MODE (vectype);
4841 /* vect_permute_load_chain requires the group size to be a power of two. */
4842 if (exact_log2 (count) == -1)
4844 if (dump_enabled_p ())
4845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4846 "the size of the group of accesses"
4847 " is not a power of 2\n");
4848 return false;
4851 /* Check that the permutation is supported. */
4852 if (VECTOR_MODE_P (mode))
4854 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4855 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4857 for (i = 0; i < nelt; i++)
4858 sel[i] = i * 2;
4859 if (can_vec_perm_p (mode, false, sel))
4861 for (i = 0; i < nelt; i++)
4862 sel[i] = i * 2 + 1;
4863 if (can_vec_perm_p (mode, false, sel))
4864 return true;
4868 if (dump_enabled_p ())
4869 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4870 "extract even/odd not supported by target\n");
4871 return false;
4874 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4875 type VECTYPE. */
4877 bool
4878 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4880 return vect_lanes_optab_supported_p ("vec_load_lanes",
4881 vec_load_lanes_optab,
4882 vectype, count);
4885 /* Function vect_permute_load_chain.
4887 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4888 a power of 2, generate extract_even/odd stmts to reorder the input data
4889 correctly. Return the final references for loads in RESULT_CHAIN.
4891 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4892 The input is 4 vectors each containing 8 elements. We assign a number to each
4893 element, the input sequence is:
4895 1st vec: 0 1 2 3 4 5 6 7
4896 2nd vec: 8 9 10 11 12 13 14 15
4897 3rd vec: 16 17 18 19 20 21 22 23
4898 4th vec: 24 25 26 27 28 29 30 31
4900 The output sequence should be:
4902 1st vec: 0 4 8 12 16 20 24 28
4903 2nd vec: 1 5 9 13 17 21 25 29
4904 3rd vec: 2 6 10 14 18 22 26 30
4905 4th vec: 3 7 11 15 19 23 27 31
4907 i.e., the first output vector should contain the first elements of each
4908 interleaving group, etc.
4910 We use extract_even/odd instructions to create such output. The input of
4911 each extract_even/odd operation is two vectors
4912 1st vec 2nd vec
4913 0 1 2 3 4 5 6 7
4915 and the output is the vector of extracted even/odd elements. The output of
4916 extract_even will be: 0 2 4 6
4917 and of extract_odd: 1 3 5 7
4920 The permutation is done in log LENGTH stages. In each stage extract_even
4921 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4922 their order. In our example,
4924 E1: extract_even (1st vec, 2nd vec)
4925 E2: extract_odd (1st vec, 2nd vec)
4926 E3: extract_even (3rd vec, 4th vec)
4927 E4: extract_odd (3rd vec, 4th vec)
4929 The output for the first stage will be:
4931 E1: 0 2 4 6 8 10 12 14
4932 E2: 1 3 5 7 9 11 13 15
4933 E3: 16 18 20 22 24 26 28 30
4934 E4: 17 19 21 23 25 27 29 31
4936 In order to proceed and create the correct sequence for the next stage (or
4937 for the correct output, if the second stage is the last one, as in our
4938 example), we first put the output of extract_even operation and then the
4939 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4940 The input for the second stage is:
4942 1st vec (E1): 0 2 4 6 8 10 12 14
4943 2nd vec (E3): 16 18 20 22 24 26 28 30
4944 3rd vec (E2): 1 3 5 7 9 11 13 15
4945 4th vec (E4): 17 19 21 23 25 27 29 31
4947 The output of the second stage:
4949 E1: 0 4 8 12 16 20 24 28
4950 E2: 2 6 10 14 18 22 26 30
4951 E3: 1 5 9 13 17 21 25 29
4952 E4: 3 7 11 15 19 23 27 31
4954 And RESULT_CHAIN after reordering:
4956 1st vec (E1): 0 4 8 12 16 20 24 28
4957 2nd vec (E3): 1 5 9 13 17 21 25 29
4958 3rd vec (E2): 2 6 10 14 18 22 26 30
4959 4th vec (E4): 3 7 11 15 19 23 27 31. */
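/* Concretely (assuming V8HI vectors, i.e. nelt == 8), the masks built below
   select

     perm_mask_even = { 0, 2, 4, 6, 8, 10, 12, 14 }
     perm_mask_odd  = { 1, 3, 5, 7, 9, 11, 13, 15 }

   i.e. the even- and odd-numbered elements of the concatenation of the two
   input vectors.  */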
4961 static void
4962 vect_permute_load_chain (vec<tree> dr_chain,
4963 unsigned int length,
4964 gimple stmt,
4965 gimple_stmt_iterator *gsi,
4966 vec<tree> *result_chain)
4968 tree data_ref, first_vect, second_vect;
4969 tree perm_mask_even, perm_mask_odd;
4970 gimple perm_stmt;
4971 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4972 unsigned int i, j, log_length = exact_log2 (length);
4973 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4974 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4976 result_chain->quick_grow (length);
4977 memcpy (result_chain->address (), dr_chain.address (),
4978 length * sizeof (tree));
4980 for (i = 0; i < nelt; ++i)
4981 sel[i] = i * 2;
4982 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4983 gcc_assert (perm_mask_even != NULL);
4985 for (i = 0; i < nelt; ++i)
4986 sel[i] = i * 2 + 1;
4987 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4988 gcc_assert (perm_mask_odd != NULL);
4990 for (i = 0; i < log_length; i++)
4992 for (j = 0; j < length; j += 2)
4994 first_vect = dr_chain[j];
4995 second_vect = dr_chain[j+1];
4997 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4998 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4999 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5000 first_vect, second_vect,
5001 perm_mask_even);
5002 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5003 (*result_chain)[j/2] = data_ref;
5005 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5006 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5007 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5008 first_vect, second_vect,
5009 perm_mask_odd);
5010 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5011 (*result_chain)[j/2+length/2] = data_ref;
5013 memcpy (dr_chain.address (), result_chain->address (),
5014 length * sizeof (tree));
5019 /* Function vect_transform_grouped_load.
5021 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5022 to perform their permutation and ascribe the resulting vectorized statements
5023 to the scalar statements.
5026 void
5027 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5028 gimple_stmt_iterator *gsi)
5030 vec<tree> result_chain = vNULL;
5032 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5033 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5034 vectors, that are ready for vector computation. */
5035 result_chain.create (size);
5036 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5037 vect_record_grouped_load_vectors (stmt, result_chain);
5038 result_chain.release ();
5041 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5042 generated as part of the vectorization of STMT. Assign the statement
5043 for each vector to the associated scalar statement. */
5045 void
5046 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5048 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5049 gimple next_stmt, new_stmt;
5050 unsigned int i, gap_count;
5051 tree tmp_data_ref;
5053 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5054 Since we scan the chain starting from its first node, their order
5055 corresponds to the order of data-refs in RESULT_CHAIN.
5056 next_stmt = first_stmt;
5057 gap_count = 1;
5058 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5060 if (!next_stmt)
5061 break;
5063 /* Skip the gaps. Loads created for the gaps will be removed later by the
5064 dead code elimination pass. No need to check for the first stmt in
5065 the group, since it always exists.
5066 GROUP_GAP is the number of steps in elements from the previous
5067 access (if there is no gap GROUP_GAP is 1). We skip loads that
5068 correspond to the gaps. */
5069 if (next_stmt != first_stmt
5070 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5072 gap_count++;
5073 continue;
5076 while (next_stmt)
5078 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5079 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5080 copies, and we put the new vector statement in the first available
5081 RELATED_STMT. */
5082 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5083 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5084 else
5086 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5088 gimple prev_stmt =
5089 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5090 gimple rel_stmt =
5091 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5092 while (rel_stmt)
5094 prev_stmt = rel_stmt;
5095 rel_stmt =
5096 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5099 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5100 new_stmt;
5104 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5105 gap_count = 1;
5106 /* If NEXT_STMT accesses the same DR as the previous statement,
5107 put the same TMP_DATA_REF as its vectorized statement; otherwise
5108 get the next data-ref from RESULT_CHAIN. */
5109 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5110 break;
5115 /* Function vect_can_force_dr_alignment_p.
5117 Returns whether the alignment of a DECL can be forced to be aligned
5118 on ALIGNMENT bit boundary. */
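/* For example (illustrative), the alignment of a file-local definition such
   as "static int a[256];" can usually be raised up to MAX_OFILE_ALIGNMENT,
   whereas "extern int b[];", a common symbol, or a constant-pool entry must
   keep the alignment its definition gives it, so false is returned for
   those.  */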
5120 bool
5121 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5123 if (TREE_CODE (decl) != VAR_DECL)
5124 return false;
5126 /* We cannot change alignment of common or external symbols as another
5127 translation unit may contain a definition with lower alignment.
5128 The rules of common symbol linking mean that the definition
5129 will override the common symbol. The same is true for constant
5130 pool entries which may be shared and are not properly merged
5131 by LTO. */
5132 if (DECL_EXTERNAL (decl)
5133 || DECL_COMMON (decl)
5134 || DECL_IN_CONSTANT_POOL (decl))
5135 return false;
5137 if (TREE_ASM_WRITTEN (decl))
5138 return false;
5140 /* Do not override the alignment as specified by the ABI when the used
5141 attribute is set. */
5142 if (DECL_PRESERVE_P (decl))
5143 return false;
5145 /* Do not override explicit alignment set by the user when an explicit
5146 section name is also used. This is a common idiom used by many
5147 software projects. */
5148 if (DECL_SECTION_NAME (decl) != NULL_TREE
5149 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
5150 return false;
5152 if (TREE_STATIC (decl))
5153 return (alignment <= MAX_OFILE_ALIGNMENT);
5154 else
5155 return (alignment <= MAX_STACK_ALIGNMENT);
5159 /* Return whether the data reference DR is supported with respect to its
5160 alignment.
5161 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5162 if it is aligned, i.e., check if it is possible to vectorize it with different
5163 alignment. */
5165 enum dr_alignment_support
5166 vect_supportable_dr_alignment (struct data_reference *dr,
5167 bool check_aligned_accesses)
5169 gimple stmt = DR_STMT (dr);
5170 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5171 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5172 enum machine_mode mode = TYPE_MODE (vectype);
5173 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5174 struct loop *vect_loop = NULL;
5175 bool nested_in_vect_loop = false;
5177 if (aligned_access_p (dr) && !check_aligned_accesses)
5178 return dr_aligned;
5180 /* For now assume all conditional loads/stores support unaligned
5181 access without any special code. */
5182 if (is_gimple_call (stmt)
5183 && gimple_call_internal_p (stmt)
5184 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5185 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5186 return dr_unaligned_supported;
5188 if (loop_vinfo)
5190 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5191 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5194 /* Possibly unaligned access. */
5196 /* We can choose between using the implicit realignment scheme (generating
5197 a misaligned_move stmt) and the explicit realignment scheme (generating
5198 aligned loads with a REALIGN_LOAD). There are two variants to the
5199 explicit realignment scheme: optimized, and unoptimized.
5200 We can optimize the realignment only if the step between consecutive
5201 vector loads is equal to the vector size. Since the vector memory
5202 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5203 is guaranteed that the misalignment amount remains the same throughout the
5204 execution of the vectorized loop. Therefore, we can create the
5205 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5206 at the loop preheader.
5208 However, in the case of outer-loop vectorization, when vectorizing a
5209 memory access in the inner-loop nested within the LOOP that is now being
5210 vectorized, while it is guaranteed that the misalignment of the
5211 vectorized memory access will remain the same in different outer-loop
5212 iterations, it is *not* guaranteed that it will remain the same throughout
5213 the execution of the inner-loop. This is because the inner-loop advances
5214 with the original scalar step (and not in steps of VS). If the inner-loop
5215 step happens to be a multiple of VS, then the misalignment remains fixed
5216 and we can use the optimized realignment scheme. For example:
5218 for (i=0; i<N; i++)
5219 for (j=0; j<M; j++)
5220 s += a[i+j];
5222 When vectorizing the i-loop in the above example, the step between
5223 consecutive vector loads is 1, and so the misalignment does not remain
5224 fixed across the execution of the inner-loop, and the realignment cannot
5225 be optimized (as illustrated in the following pseudo vectorized loop):
5227 for (i=0; i<N; i+=4)
5228 for (j=0; j<M; j++){
5229 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5230 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5231 // (assuming that we start from an aligned address).
5234 We therefore have to use the unoptimized realignment scheme:
5236 for (i=0; i<N; i+=4)
5237 for (j=k; j<M; j+=4)
5238 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5239 // that the misalignment of the initial address is
5240 // 0).
5242 The loop can then be vectorized as follows:
5244 for (k=0; k<4; k++){
5245 rt = get_realignment_token (&vp[k]);
5246 for (i=0; i<N; i+=4){
5247 v1 = vp[i+k];
5248 for (j=k; j<M; j+=4){
5249 v2 = vp[i+j+VS-1];
5250 va = REALIGN_LOAD <v1,v2,rt>;
5251 vs += va;
5252 v1 = v2;
5255 } */
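/* To summarize the cases below: reads for which the target provides
   vec_realign_load (and, if it requires one, builtin_mask_for_load) get one
   of the explicit realignment schemes; otherwise both reads and writes fall
   back to dr_unaligned_supported when the target reports that misaligned
   vector accesses are supported, and to dr_unaligned_unsupported if not.  */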
5257 if (DR_IS_READ (dr))
5259 bool is_packed = false;
5260 tree type = (TREE_TYPE (DR_REF (dr)));
5262 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5263 && (!targetm.vectorize.builtin_mask_for_load
5264 || targetm.vectorize.builtin_mask_for_load ()))
5266 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5267 if ((nested_in_vect_loop
5268 && (TREE_INT_CST_LOW (DR_STEP (dr))
5269 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5270 || !loop_vinfo)
5271 return dr_explicit_realign;
5272 else
5273 return dr_explicit_realign_optimized;
5275 if (!known_alignment_for_access_p (dr))
5276 is_packed = not_size_aligned (DR_REF (dr));
5278 if ((TYPE_USER_ALIGN (type) && !is_packed)
5279 || targetm.vectorize.
5280 support_vector_misalignment (mode, type,
5281 DR_MISALIGNMENT (dr), is_packed))
5282 /* Can't software pipeline the loads, but can at least do them. */
5283 return dr_unaligned_supported;
5285 else
5287 bool is_packed = false;
5288 tree type = (TREE_TYPE (DR_REF (dr)));
5290 if (!known_alignment_for_access_p (dr))
5291 is_packed = not_size_aligned (DR_REF (dr));
5293 if ((TYPE_USER_ALIGN (type) && !is_packed)
5294 || targetm.vectorize.
5295 support_vector_misalignment (mode, type,
5296 DR_MISALIGNMENT (dr), is_packed))
5297 return dr_unaligned_supported;
5300 /* Unsupported. */
5301 return dr_unaligned_unsupported;