/* Data References Analysis and Manipulation Utilities for Vectorization.
   Copyright (C) 2003-2014 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "tree.h"
#include "stor-layout.h"
#include "tm_p.h"
#include "target.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "tree-eh.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop.h"
#include "dumpfile.h"
#include "cfgloop.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "diagnostic-core.h"
#include "cgraph.h"
/* Need to include rtl.h, expr.h, etc. for optabs.  */
#include "expr.h"
#include "optabs.h"

/* Return true if load- or store-lanes optab OPTAB is implemented for
   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */

static bool
vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
{
  enum machine_mode mode, array_mode;
  bool limit_p;

  mode = TYPE_MODE (vectype);
  limit_p = !targetm.array_mode_supported_p (mode, count);
  array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
                              MODE_INT, limit_p);

  if (array_mode == BLKmode)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
                         GET_MODE_NAME (mode), count);
      return false;
    }

  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "cannot use %s<%s><%s>\n", name,
                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
      return false;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
                     GET_MODE_NAME (mode));

  return true;
}
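
/* Illustrative sketch (editorial, not part of the original sources): this is
   the query that the load/store-lanes support checks later in this file
   reduce to.  For example, on a hypothetical target with 128-bit V4SImode
   vectors and COUNT == 2 we look for an integer mode of 2 * 128 = 256 bits
   to act as the array mode, and then require a vec_load_lanes (or
   vec_store_lanes) pattern converting between that array mode and V4SImode.
   If either piece is missing, the function returns false and the vectorizer
   has to fall back to ordinary loads/stores plus permutes.  The concrete
   modes above are an example only.  */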

/* Return the smallest scalar part of STMT.
   This is used to determine the vectype of the stmt.  We generally set the
   vectype according to the type of the result (lhs).  For stmts whose
   result-type is different than the type of the arguments (e.g., demotion,
   promotion), vectype will be reset appropriately (later).  Note that we have
   to visit the smallest datatype in this function, because that determines the
   VF.  If the smallest datatype in the loop is present only as the rhs of a
   promotion operation - we'd miss it.
   Such a case, where a variable of this datatype does not appear in the lhs
   anywhere in the loop, can only occur if it's an invariant: e.g.:
   'int_x = (int) short_inv', which we'd expect to have been optimized away by
   invariant motion.  However, we cannot rely on invariant motion to always
   take invariants out of the loop, and so in the case of promotion we also
   have to check the rhs.
   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
   types.  */

tree
vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
                               HOST_WIDE_INT *rhs_size_unit)
{
  tree scalar_type = gimple_expr_type (stmt);
  HOST_WIDE_INT lhs, rhs;

  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));

  if (is_gimple_assign (stmt)
      && (gimple_assign_cast_p (stmt)
          || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
          || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
          || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
    {
      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));

      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
      if (rhs < lhs)
        scalar_type = rhs_type;
    }

  *lhs_size_unit = lhs;
  *rhs_size_unit = rhs;
  return scalar_type;
}
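
/* Worked example (editorial, not in the original sources): for a widening
   multiplication such as

     int_res = (int) short_a * (int) short_b;   // WIDEN_MULT_EXPR

   the lhs type is int (4 bytes) but the first rhs operand is a short
   (2 bytes), so the function returns the short type with
   *LHS_SIZE_UNIT == 4 and *RHS_SIZE_UNIT == 2; the vectorization factor is
   then derived from the 2-byte element rather than the 4-byte result.  */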

/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
   tested at run-time.  Return TRUE if DDR was successfully inserted.
   Return false if versioning is not supported.  */

static bool
vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
    return false;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "mark for run-time aliasing test between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
      dump_printf (MSG_NOTE, "\n");
    }

  if (optimize_loop_nest_for_size_p (loop))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not supported when optimizing"
                         " for size.\n");
      return false;
    }

  /* FORNOW: We don't support versioning with outer-loop vectorization.  */
  if (loop->inner)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not yet supported for outer-loops.\n");
      return false;
    }

  /* FORNOW: We don't support creating runtime alias tests for non-constant
     step.  */
  if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
      || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "versioning not yet supported for non-constant "
                         "step\n");
      return false;
    }

  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
  return true;
}
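
/* Note (editorial): the DDRs pushed onto LOOP_VINFO_MAY_ALIAS_DDRS are later
   turned into pairwise segment-overlap checks when the loop is versioned,
   conceptually of the form

     if (a + seg_len_a <= b || b + seg_len_b <= a)
       ... vectorized loop ...
     else
       ... scalar fallback loop ...

   which is why a constant DR_STEP is required above: the segment lengths
   must be computable in the loop preheader.  The snippet is a sketch of the
   test, not the exact code the vectorizer emits.  */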

/* Function vect_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB.  When versioning for alias may check a
   dependence at run-time, return FALSE.  Adjust *MAX_VF according to
   the data dependence.  */

static bool
vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                                  loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);
  stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
  stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
  lambda_vector dist_v;
  unsigned int loop_depth;

  /* In loop analysis all data references should be vectorizable.  */
  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
    gcc_unreachable ();

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb
      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "can't determine dependence between ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                             DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  /* Known data dependence.  */
  if (DDR_NUM_DIST_VECTS (ddr) == 0)
    {
      /* If user asserted safelen consecutive iterations can be
         executed concurrently, assume independence.  */
      if (loop->safelen >= 2)
        {
          if (loop->safelen < *max_vf)
            *max_vf = loop->safelen;
          LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
          return false;
        }

      if (STMT_VINFO_GATHER_P (stmtinfo_a)
          || STMT_VINFO_GATHER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "versioning for alias not supported for: "
                               "bad dist vector for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (dra));
              dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                 DR_REF (drb));
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
          return true;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "versioning for alias required: "
                           "bad dist vector for ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      /* Add to list of ddrs that need to be tested at run-time.  */
      return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
    }

  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      int dist = dist_v[loop_depth];

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "dependence distance = %d.\n", dist);

      if (dist == 0)
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "dependence distance == 0 between ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE, " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
              dump_printf (MSG_NOTE, "\n");
            }

          /* When we perform grouped accesses and perform implicit CSE
             by detecting equal accesses and doing disambiguation with
             runtime alias tests like for
                .. = a[i];
                .. = a[i+1];
                a[i] = ..;
                a[i+1] = ..;
                *p = ..;
                .. = a[i];
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
             stores after the last store will do the right thing.  */
          if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
               && GROUP_SAME_DR_STMT (stmtinfo_a))
              || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
                  && GROUP_SAME_DR_STMT (stmtinfo_b)))
            {
              gimple earlier_stmt;
              earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
              if (DR_IS_WRITE
                    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "READ_WRITE dependence in interleaving."
                                     "\n");
                  return true;
                }
            }

          continue;
        }

      if (dist > 0 && DDR_REVERSED_P (ddr))
        {
          /* If DDR_REVERSED_P the order of the data-refs in DDR was
             reversed (to make distance vector positive), and the actual
             distance is negative.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "dependence distance negative.\n");
          continue;
        }

      if (abs (dist) >= 2
          && abs (dist) < *max_vf)
        {
          /* The dependence distance requires reduction of the maximal
             vectorization factor.  */
          *max_vf = abs (dist);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "adjusting maximal vectorization factor to %i\n",
                             *max_vf);
        }

      if (abs (dist) >= *max_vf)
        {
          /* Dependence distance does not create dependence, as far as
             vectorization is concerned, in this case.  */
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "dependence distance >= VF.\n");
          continue;
        }

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized, possible dependence "
                           "between data-refs ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }

      return true;
    }

  return false;
}
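
/* Worked example (editorial, not in the original sources): for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the a[] self-dependence has distance 3, so the code above keeps the
   dependence but lowers *MAX_VF to 3; a vectorization factor of 4 is then
   rejected while a factor of 2 would still be allowed.  A distance of 0
   (a[i] read and written in the same iteration) is harmless and is skipped,
   as is a dependence whose actual distance is negative (DDR_REVERSED_P).  */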

/* Function vect_analyze_data_ref_dependences.

   Examine all the data references in the loop, and make sure there do not
   exist any data dependences between them.  Set *MAX_VF according to
   the maximum vectorization factor the data dependences allow.  */

bool
vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
{
  unsigned int i;
  struct data_dependence_relation *ddr;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_data_ref_dependences ===\n");

  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
  if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                &LOOP_VINFO_DDRS (loop_vinfo),
                                LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
    return false;

  FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
    if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
      return false;

  return true;
}

/* Function vect_slp_analyze_data_ref_dependence.

   Return TRUE if there (might) exist a dependence between a memory-reference
   DRA and a memory-reference DRB that prevents basic-block (SLP)
   vectorization.  Unlike the loop variant above, no run-time alias
   versioning is attempted here.  */

static bool
vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
{
  struct data_reference *dra = DDR_A (ddr);
  struct data_reference *drb = DDR_B (ddr);

  /* We need to check dependences of statements marked as unvectorizable
     as well, they still can prohibit vectorization.  */

  /* Independent data accesses.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;

  if (dra == drb)
    return false;

  /* Read-read is OK.  */
  if (DR_IS_READ (dra) && DR_IS_READ (drb))
    return false;

  /* If dra and drb are part of the same interleaving chain consider
     them independent.  */
  if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
      && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
          == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
    return false;

  /* Unknown data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "can't determine dependence between ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
          dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
    }
  else if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "determined dependence between ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
      dump_printf (MSG_NOTE, " and ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
      dump_printf (MSG_NOTE, "\n");
    }

  /* We do not vectorize basic blocks with write-write dependencies.  */
  if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
    return true;

  /* If we have a read-write dependence check that the load is before the store.
     When we vectorize basic blocks, vector load can be only before
     corresponding scalar load, and vector store can be only after its
     corresponding scalar store.  So the order of the accesses is preserved in
     case the load is before the store.  */
  gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
  if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
    {
      /* That only holds for load-store pairs taking part in vectorization.  */
      if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
          && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
        return false;
    }

  return true;
}
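
/* Illustrative note (editorial): for basic-block SLP the only ordering that
   can be preserved is "vector loads before, vector stores after", so

     x = a[0];  a[0] = y;      // load before store: accepted (returns false)
     a[0] = y;  x = a[0];      // store before load: rejected (returns true)

   assuming both statements take part in the SLP instance; write-write pairs
   on the same location are always rejected.  */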

/* Function vect_slp_analyze_data_ref_dependences.

   Examine all the data references in the basic-block, and make sure there
   do not exist any data dependences between them that would prevent the
   block from being vectorized.  */

bool
vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
{
  struct data_dependence_relation *ddr;
  unsigned int i;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_slp_analyze_data_ref_dependences ===\n");

  if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
                                &BB_VINFO_DDRS (bb_vinfo),
                                vNULL, true))
    return false;

  FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
    if (vect_slp_analyze_data_ref_dependence (ddr))
      return false;

  return true;
}

/* Function vect_compute_data_ref_alignment

   Compute the misalignment of the data reference DR.

   Output:
   1. If during the misalignment computation it is found that the data reference
      cannot be vectorized then false is returned.
   2. DR_MISALIGNMENT (DR) is defined.

   FOR NOW: No analysis is actually performed.  Misalignment is calculated
   only for trivial cases.  TODO.  */

static bool
vect_compute_data_ref_alignment (struct data_reference *dr)
{
  gimple stmt = DR_STMT (dr);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = NULL;
  tree ref = DR_REF (dr);
  tree vectype;
  tree base, base_addr;
  bool base_aligned;
  tree misalign;
  tree aligned_to, alignment;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_compute_data_ref_alignment:\n");

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Initialize misalignment to unknown.  */
  SET_DR_MISALIGNMENT (dr, -1);

  /* Strided loads perform only component accesses, misalignment information
     is irrelevant for them.  */
  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
    return true;

  misalign = DR_INIT (dr);
  aligned_to = DR_ALIGNED_TO (dr);
  base_addr = DR_BASE_ADDRESS (dr);
  vectype = STMT_VINFO_VECTYPE (stmt_info);

  /* In case the dataref is in an inner-loop of the loop that is being
     vectorized (LOOP), we use the base and misalignment information
     relative to the outer-loop (LOOP).  This is ok only if the misalignment
     stays the same throughout the execution of the inner-loop, which is why
     we have to check that the stride of the dataref in the inner-loop is a
     multiple of the vector size.  */
  if (loop && nested_in_vect_loop_p (loop, stmt))
    {
      tree step = DR_STEP (dr);
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);

      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "inner step divides the vector-size.\n");
          misalign = STMT_VINFO_DR_INIT (stmt_info);
          aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
          base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
        }
      else
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "inner step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;
        }
    }

  /* Similarly, if we're doing basic-block vectorization, we can only use
     base and misalignment information relative to an innermost loop if the
     misalignment stays the same throughout the execution of the loop.
     As above, this is the case if the stride of the dataref is a multiple
     of the vector size.  */
  if (!loop)
    {
      tree step = DR_STEP (dr);
      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);

      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "SLP: step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;
        }
    }

  base = build_fold_indirect_ref (base_addr);
  alignment = ssize_int (TYPE_ALIGN (vectype) / BITS_PER_UNIT);

  if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
      || !misalign)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
      return true;
    }

  if ((DECL_P (base)
       && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
                                alignment) >= 0)
      || (TREE_CODE (base_addr) == SSA_NAME
          && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
                                                TREE_TYPE (base_addr)))),
                                   alignment) >= 0)
      || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
    base_aligned = true;
  else
    base_aligned = false;

  if (!base_aligned)
    {
      /* Do not change the alignment of global variables here if
         flag_section_anchors is enabled as we already generated
         RTL for other functions.  Most global variables should
         have been aligned during the IPA increase_alignment pass.  */
      if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
          || (TREE_STATIC (base) && flag_section_anchors))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "can't force alignment of ref: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
              dump_printf (MSG_NOTE, "\n");
            }
          return true;
        }

      /* Force the alignment of the decl.
         NOTE: This is the only change to the code we make during
         the analysis phase, before deciding to vectorize the loop.  */
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
          dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
          dump_printf (MSG_NOTE, "\n");
        }

      ((dataref_aux *)dr->aux)->base_decl = base;
      ((dataref_aux *)dr->aux)->base_misaligned = true;
    }

  /* If this is a backward running DR then first access in the larger
     vectype actually is N-1 elements before the address in the DR.
     Adjust misalign accordingly.  */
  if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
    {
      tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
      /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
         otherwise we wouldn't be here.  */
      offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
      /* PLUS because DR_STEP was negative.  */
      misalign = size_binop (PLUS_EXPR, misalign, offset);
    }

  /* Modulo alignment.  */
  misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);

  if (!tree_fits_uhwi_p (misalign))
    {
      /* Negative or overflowed misalignment value.  */
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unexpected misalign value\n");
      return false;
    }

  SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                       "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
    }

  return true;
}
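
/* Worked example (editorial, not in the original sources): with a V4SI
   vectype, ALIGNMENT is 16 bytes.  Accessing a[i + 1] where the array a is
   16-byte aligned gives DR_INIT == 4, so after the FLOOR_MOD step the
   recorded DR_MISALIGNMENT is 4.  For a backward access (DR_STEP == -4) the
   first element of the vector actually lies 3 * 4 bytes before that
   address, which is what the "PLUS because DR_STEP was negative" adjustment
   accounts for.  The numbers assume 4-byte ints and 16-byte vectors.  */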
768 /* Function vect_compute_data_refs_alignment
770 Compute the misalignment of data references in the loop.
771 Return FALSE if a data reference is found that cannot be vectorized. */
773 static bool
774 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
775 bb_vec_info bb_vinfo)
777 vec<data_reference_p> datarefs;
778 struct data_reference *dr;
779 unsigned int i;
781 if (loop_vinfo)
782 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
783 else
784 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
786 FOR_EACH_VEC_ELT (datarefs, i, dr)
787 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
788 && !vect_compute_data_ref_alignment (dr))
790 if (bb_vinfo)
792 /* Mark unsupported statement as unvectorizable. */
793 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
794 continue;
796 else
797 return false;
800 return true;
804 /* Function vect_update_misalignment_for_peel
806 DR - the data reference whose misalignment is to be adjusted.
807 DR_PEEL - the data reference whose misalignment is being made
808 zero in the vector loop by the peel.
809 NPEEL - the number of iterations in the peel loop if the misalignment
810 of DR_PEEL is known at compile time. */
812 static void
813 vect_update_misalignment_for_peel (struct data_reference *dr,
814 struct data_reference *dr_peel, int npeel)
816 unsigned int i;
817 vec<dr_p> same_align_drs;
818 struct data_reference *current_dr;
819 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
820 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
821 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
822 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
824 /* For interleaved data accesses the step in the loop must be multiplied by
825 the size of the interleaving group. */
826 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
827 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
828 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
829 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
831 /* It can be assumed that the data refs with the same alignment as dr_peel
832 are aligned in the vector loop. */
833 same_align_drs
834 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
835 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
837 if (current_dr != dr)
838 continue;
839 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
840 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
841 SET_DR_MISALIGNMENT (dr, 0);
842 return;
845 if (known_alignment_for_access_p (dr)
846 && known_alignment_for_access_p (dr_peel))
848 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
849 int misal = DR_MISALIGNMENT (dr);
850 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
851 misal += negative ? -npeel * dr_size : npeel * dr_size;
852 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
853 SET_DR_MISALIGNMENT (dr, misal);
854 return;
857 if (dump_enabled_p ())
858 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
859 SET_DR_MISALIGNMENT (dr, -1);
863 /* Function vect_verify_datarefs_alignment
865 Return TRUE if all data references in the loop can be
866 handled with respect to alignment. */
868 bool
869 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
871 vec<data_reference_p> datarefs;
872 struct data_reference *dr;
873 enum dr_alignment_support supportable_dr_alignment;
874 unsigned int i;
876 if (loop_vinfo)
877 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
878 else
879 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
881 FOR_EACH_VEC_ELT (datarefs, i, dr)
883 gimple stmt = DR_STMT (dr);
884 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
886 if (!STMT_VINFO_RELEVANT_P (stmt_info))
887 continue;
889 /* For interleaving, only the alignment of the first access matters.
890 Skip statements marked as not vectorizable. */
891 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
892 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
893 || !STMT_VINFO_VECTORIZABLE (stmt_info))
894 continue;
896 /* Strided loads perform only component accesses, alignment is
897 irrelevant for them. */
898 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
899 continue;
901 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
902 if (!supportable_dr_alignment)
904 if (dump_enabled_p ())
906 if (DR_IS_READ (dr))
907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
908 "not vectorized: unsupported unaligned load.");
909 else
910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
911 "not vectorized: unsupported unaligned "
912 "store.");
914 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
915 DR_REF (dr));
916 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
918 return false;
920 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
921 dump_printf_loc (MSG_NOTE, vect_location,
922 "Vectorizing an unaligned access.\n");
924 return true;
/* Given a memory reference EXP return whether its alignment is less
   than its size.  */

static bool
not_size_aligned (tree exp)
{
  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
    return true;

  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
          > get_object_alignment (exp));
}
940 /* Function vector_alignment_reachable_p
942 Return true if vector alignment for DR is reachable by peeling
943 a few loop iterations. Return false otherwise. */
945 static bool
946 vector_alignment_reachable_p (struct data_reference *dr)
948 gimple stmt = DR_STMT (dr);
949 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
950 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
952 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
954 /* For interleaved access we peel only if number of iterations in
955 the prolog loop ({VF - misalignment}), is a multiple of the
956 number of the interleaved accesses. */
957 int elem_size, mis_in_elements;
958 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
960 /* FORNOW: handle only known alignment. */
961 if (!known_alignment_for_access_p (dr))
962 return false;
964 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
965 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
967 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
968 return false;
971 /* If misalignment is known at the compile time then allow peeling
972 only if natural alignment is reachable through peeling. */
973 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
975 HOST_WIDE_INT elmsize =
976 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
977 if (dump_enabled_p ())
979 dump_printf_loc (MSG_NOTE, vect_location,
980 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
981 dump_printf (MSG_NOTE,
982 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
984 if (DR_MISALIGNMENT (dr) % elmsize)
986 if (dump_enabled_p ())
987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
988 "data size does not divide the misalignment.\n");
989 return false;
993 if (!known_alignment_for_access_p (dr))
995 tree type = TREE_TYPE (DR_REF (dr));
996 bool is_packed = not_size_aligned (DR_REF (dr));
997 if (dump_enabled_p ())
998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
999 "Unknown misalignment, is_packed = %d\n",is_packed);
1000 if ((TYPE_USER_ALIGN (type) && !is_packed)
1001 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1002 return true;
1003 else
1004 return false;
1007 return true;
1011 /* Calculate the cost of the memory access represented by DR. */
1013 static void
1014 vect_get_data_access_cost (struct data_reference *dr,
1015 unsigned int *inside_cost,
1016 unsigned int *outside_cost,
1017 stmt_vector_for_cost *body_cost_vec)
1019 gimple stmt = DR_STMT (dr);
1020 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1021 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1022 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1023 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1024 int ncopies = vf / nunits;
1026 if (DR_IS_READ (dr))
1027 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1028 NULL, body_cost_vec, false);
1029 else
1030 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_NOTE, vect_location,
1034 "vect_get_data_access_cost: inside_cost = %d, "
1035 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1039 /* Insert DR into peeling hash table with NPEEL as key. */
1041 static void
1042 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1043 int npeel)
1045 struct _vect_peel_info elem, *slot;
1046 _vect_peel_info **new_slot;
1047 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1049 elem.npeel = npeel;
1050 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1051 if (slot)
1052 slot->count++;
1053 else
1055 slot = XNEW (struct _vect_peel_info);
1056 slot->npeel = npeel;
1057 slot->dr = dr;
1058 slot->count = 1;
1059 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1060 *new_slot = slot;
1063 if (!supportable_dr_alignment
1064 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1065 slot->count += VECT_MAX_COST;
1069 /* Traverse peeling hash table to find peeling option that aligns maximum
1070 number of data accesses. */
1073 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1074 _vect_peel_extended_info *max)
1076 vect_peel_info elem = *slot;
1078 if (elem->count > max->peel_info.count
1079 || (elem->count == max->peel_info.count
1080 && max->peel_info.npeel > elem->npeel))
1082 max->peel_info.npeel = elem->npeel;
1083 max->peel_info.count = elem->count;
1084 max->peel_info.dr = elem->dr;
1087 return 1;
1091 /* Traverse peeling hash table and calculate cost for each peeling option.
1092 Find the one with the lowest cost. */
1095 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1096 _vect_peel_extended_info *min)
1098 vect_peel_info elem = *slot;
1099 int save_misalignment, dummy;
1100 unsigned int inside_cost = 0, outside_cost = 0, i;
1101 gimple stmt = DR_STMT (elem->dr);
1102 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1103 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1104 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1105 struct data_reference *dr;
1106 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1107 int single_iter_cost;
1109 prologue_cost_vec.create (2);
1110 body_cost_vec.create (2);
1111 epilogue_cost_vec.create (2);
1113 FOR_EACH_VEC_ELT (datarefs, i, dr)
1115 stmt = DR_STMT (dr);
1116 stmt_info = vinfo_for_stmt (stmt);
1117 /* For interleaving, only the alignment of the first access
1118 matters. */
1119 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1120 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1121 continue;
1123 save_misalignment = DR_MISALIGNMENT (dr);
1124 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1125 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1126 &body_cost_vec);
1127 SET_DR_MISALIGNMENT (dr, save_misalignment);
1130 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1131 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1132 &dummy, single_iter_cost,
1133 &prologue_cost_vec,
1134 &epilogue_cost_vec);
1136 /* Prologue and epilogue costs are added to the target model later.
1137 These costs depend only on the scalar iteration cost, the
1138 number of peeling iterations finally chosen, and the number of
1139 misaligned statements. So discard the information found here. */
1140 prologue_cost_vec.release ();
1141 epilogue_cost_vec.release ();
1143 if (inside_cost < min->inside_cost
1144 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1146 min->inside_cost = inside_cost;
1147 min->outside_cost = outside_cost;
1148 min->body_cost_vec.release ();
1149 min->body_cost_vec = body_cost_vec;
1150 min->peel_info.dr = elem->dr;
1151 min->peel_info.npeel = elem->npeel;
1153 else
1154 body_cost_vec.release ();
1156 return 1;
1160 /* Choose best peeling option by traversing peeling hash table and either
1161 choosing an option with the lowest cost (if cost model is enabled) or the
1162 option that aligns as many accesses as possible. */
1164 static struct data_reference *
1165 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1166 unsigned int *npeel,
1167 stmt_vector_for_cost *body_cost_vec)
1169 struct _vect_peel_extended_info res;
1171 res.peel_info.dr = NULL;
1172 res.body_cost_vec = stmt_vector_for_cost ();
1174 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1176 res.inside_cost = INT_MAX;
1177 res.outside_cost = INT_MAX;
1178 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1179 .traverse <_vect_peel_extended_info *,
1180 vect_peeling_hash_get_lowest_cost> (&res);
1182 else
1184 res.peel_info.count = 0;
1185 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1186 .traverse <_vect_peel_extended_info *,
1187 vect_peeling_hash_get_most_frequent> (&res);
1190 *npeel = res.peel_info.npeel;
1191 *body_cost_vec = res.body_cost_vec;
1192 return res.peel_info.dr;
1196 /* Function vect_enhance_data_refs_alignment
1198 This pass will use loop versioning and loop peeling in order to enhance
1199 the alignment of data references in the loop.
1201 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1202 original loop is to be vectorized. Any other loops that are created by
1203 the transformations performed in this pass - are not supposed to be
1204 vectorized. This restriction will be relaxed.
1206 This pass will require a cost model to guide it whether to apply peeling
1207 or versioning or a combination of the two. For example, the scheme that
1208 intel uses when given a loop with several memory accesses, is as follows:
1209 choose one memory access ('p') which alignment you want to force by doing
1210 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1211 other accesses are not necessarily aligned, or (2) use loop versioning to
1212 generate one loop in which all accesses are aligned, and another loop in
1213 which only 'p' is necessarily aligned.
1215 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1216 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1217 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1219 Devising a cost model is the most critical aspect of this work. It will
1220 guide us on which access to peel for, whether to use loop versioning, how
1221 many versions to create, etc. The cost model will probably consist of
1222 generic considerations as well as target specific considerations (on
1223 powerpc for example, misaligned stores are more painful than misaligned
1224 loads).
1226 Here are the general steps involved in alignment enhancements:
1228 -- original loop, before alignment analysis:
1229 for (i=0; i<N; i++){
1230 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1231 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1234 -- After vect_compute_data_refs_alignment:
1235 for (i=0; i<N; i++){
1236 x = q[i]; # DR_MISALIGNMENT(q) = 3
1237 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1240 -- Possibility 1: we do loop versioning:
1241 if (p is aligned) {
1242 for (i=0; i<N; i++){ # loop 1A
1243 x = q[i]; # DR_MISALIGNMENT(q) = 3
1244 p[i] = y; # DR_MISALIGNMENT(p) = 0
1247 else {
1248 for (i=0; i<N; i++){ # loop 1B
1249 x = q[i]; # DR_MISALIGNMENT(q) = 3
1250 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1254 -- Possibility 2: we do loop peeling:
1255 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1256 x = q[i];
1257 p[i] = y;
1259 for (i = 3; i < N; i++){ # loop 2A
1260 x = q[i]; # DR_MISALIGNMENT(q) = 0
1261 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1264 -- Possibility 3: combination of loop peeling and versioning:
1265 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1266 x = q[i];
1267 p[i] = y;
1269 if (p is aligned) {
1270 for (i = 3; i<N; i++){ # loop 3A
1271 x = q[i]; # DR_MISALIGNMENT(q) = 0
1272 p[i] = y; # DR_MISALIGNMENT(p) = 0
1275 else {
1276 for (i = 3; i<N; i++){ # loop 3B
1277 x = q[i]; # DR_MISALIGNMENT(q) = 0
1278 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1282 These loops are later passed to loop_transform to be vectorized. The
1283 vectorizer will use the alignment information to guide the transformation
1284 (whether to generate regular loads/stores, or with special handling for
1285 misalignment). */
1287 bool
1288 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1290 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1291 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1292 enum dr_alignment_support supportable_dr_alignment;
1293 struct data_reference *dr0 = NULL, *first_store = NULL;
1294 struct data_reference *dr;
1295 unsigned int i, j;
1296 bool do_peeling = false;
1297 bool do_versioning = false;
1298 bool stat;
1299 gimple stmt;
1300 stmt_vec_info stmt_info;
1301 unsigned int npeel = 0;
1302 bool all_misalignments_unknown = true;
1303 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1304 unsigned possible_npeel_number = 1;
1305 tree vectype;
1306 unsigned int nelements, mis, same_align_drs_max = 0;
1307 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1309 if (dump_enabled_p ())
1310 dump_printf_loc (MSG_NOTE, vect_location,
1311 "=== vect_enhance_data_refs_alignment ===\n");
1313 /* While cost model enhancements are expected in the future, the high level
1314 view of the code at this time is as follows:
1316 A) If there is a misaligned access then see if peeling to align
1317 this access can make all data references satisfy
1318 vect_supportable_dr_alignment. If so, update data structures
1319 as needed and return true.
1321 B) If peeling wasn't possible and there is a data reference with an
1322 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1323 then see if loop versioning checks can be used to make all data
1324 references satisfy vect_supportable_dr_alignment. If so, update
1325 data structures as needed and return true.
1327 C) If neither peeling nor versioning were successful then return false if
1328 any data reference does not satisfy vect_supportable_dr_alignment.
1330 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1332 Note, Possibility 3 above (which is peeling and versioning together) is not
1333 being done at this time. */
1335 /* (1) Peeling to force alignment. */
1337 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1338 Considerations:
1339 + How many accesses will become aligned due to the peeling
1340 - How many accesses will become unaligned due to the peeling,
1341 and the cost of misaligned accesses.
1342 - The cost of peeling (the extra runtime checks, the increase
1343 in code size). */
1345 FOR_EACH_VEC_ELT (datarefs, i, dr)
1347 stmt = DR_STMT (dr);
1348 stmt_info = vinfo_for_stmt (stmt);
1350 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1351 continue;
1353 /* For interleaving, only the alignment of the first access
1354 matters. */
1355 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1356 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1357 continue;
1359 /* For invariant accesses there is nothing to enhance. */
1360 if (integer_zerop (DR_STEP (dr)))
1361 continue;
1363 /* Strided loads perform only component accesses, alignment is
1364 irrelevant for them. */
1365 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1366 continue;
1368 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1369 do_peeling = vector_alignment_reachable_p (dr);
1370 if (do_peeling)
1372 if (known_alignment_for_access_p (dr))
1374 unsigned int npeel_tmp;
1375 bool negative = tree_int_cst_compare (DR_STEP (dr),
1376 size_zero_node) < 0;
1378 /* Save info about DR in the hash table. */
1379 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1380 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1382 vectype = STMT_VINFO_VECTYPE (stmt_info);
1383 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1384 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1385 TREE_TYPE (DR_REF (dr))));
1386 npeel_tmp = (negative
1387 ? (mis - nelements) : (nelements - mis))
1388 & (nelements - 1);
          /* For multiple types, it is possible that the bigger type access
             will have more than one peeling option.  E.g., a loop with two
             types: one of size (vector size / 4), and the other one of
             size (vector size / 8).  The vectorization factor will be 8.
             If both accesses are misaligned by 3, the first one needs one
             scalar iteration to be aligned, and the second one needs 5.
             But the first one will also be aligned by peeling 5 scalar
             iterations, and in that case both accesses will be aligned.
             Hence, except for the immediate peeling amount, we also want
             to try to add full vector size, while we don't exceed
             vectorization factor.
             We do this automatically for the cost model, since we calculate
             the cost for every peeling option.  */
1403 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1404 possible_npeel_number = vf /nelements;
1406 /* Handle the aligned case. We may decide to align some other
1407 access, making DR unaligned. */
1408 if (DR_MISALIGNMENT (dr) == 0)
1410 npeel_tmp = 0;
1411 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1412 possible_npeel_number++;
1415 for (j = 0; j < possible_npeel_number; j++)
1417 gcc_assert (npeel_tmp <= vf);
1418 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1419 npeel_tmp += nelements;
1422 all_misalignments_unknown = false;
1423 /* Data-ref that was chosen for the case that all the
1424 misalignments are unknown is not relevant anymore, since we
1425 have a data-ref with known alignment. */
1426 dr0 = NULL;
1428 else
          /* If we don't know any misalignment values, we prefer
             peeling for the data-ref that has the maximum number of data-refs
             with the same alignment, unless the target prefers to align
             stores over loads.  */
1434 if (all_misalignments_unknown)
1436 unsigned same_align_drs
1437 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1438 if (!dr0
1439 || same_align_drs_max < same_align_drs)
1441 same_align_drs_max = same_align_drs;
1442 dr0 = dr;
1444 /* For data-refs with the same number of related
1445 accesses prefer the one where the misalign
1446 computation will be invariant in the outermost loop. */
1447 else if (same_align_drs_max == same_align_drs)
1449 struct loop *ivloop0, *ivloop;
1450 ivloop0 = outermost_invariant_loop_for_expr
1451 (loop, DR_BASE_ADDRESS (dr0));
1452 ivloop = outermost_invariant_loop_for_expr
1453 (loop, DR_BASE_ADDRESS (dr));
1454 if ((ivloop && !ivloop0)
1455 || (ivloop && ivloop0
1456 && flow_loop_nested_p (ivloop, ivloop0)))
1457 dr0 = dr;
1460 if (!first_store && DR_IS_WRITE (dr))
1461 first_store = dr;
1464 /* If there are both known and unknown misaligned accesses in the
1465 loop, we choose peeling amount according to the known
1466 accesses. */
1467 if (!supportable_dr_alignment)
1469 dr0 = dr;
1470 if (!first_store && DR_IS_WRITE (dr))
1471 first_store = dr;
1475 else
1477 if (!aligned_access_p (dr))
1479 if (dump_enabled_p ())
1480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1481 "vector alignment may not be reachable\n");
1482 break;
1487 /* Check if we can possibly peel the loop. */
1488 if (!vect_can_advance_ivs_p (loop_vinfo)
1489 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1490 do_peeling = false;
1492 if (do_peeling && all_misalignments_unknown
1493 && vect_supportable_dr_alignment (dr0, false))
1496 /* Check if the target requires to prefer stores over loads, i.e., if
1497 misaligned stores are more expensive than misaligned loads (taking
1498 drs with same alignment into account). */
1499 if (first_store && DR_IS_READ (dr0))
1501 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1502 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1503 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1504 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1505 stmt_vector_for_cost dummy;
1506 dummy.create (2);
1508 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1509 &dummy);
1510 vect_get_data_access_cost (first_store, &store_inside_cost,
1511 &store_outside_cost, &dummy);
1513 dummy.release ();
1515 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1516 aligning the load DR0). */
1517 load_inside_penalty = store_inside_cost;
1518 load_outside_penalty = store_outside_cost;
1519 for (i = 0;
1520 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1521 DR_STMT (first_store))).iterate (i, &dr);
1522 i++)
1523 if (DR_IS_READ (dr))
1525 load_inside_penalty += load_inside_cost;
1526 load_outside_penalty += load_outside_cost;
1528 else
1530 load_inside_penalty += store_inside_cost;
1531 load_outside_penalty += store_outside_cost;
1534 /* Calculate the penalty for leaving DR0 unaligned (by
1535 aligning the FIRST_STORE). */
1536 store_inside_penalty = load_inside_cost;
1537 store_outside_penalty = load_outside_cost;
1538 for (i = 0;
1539 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1540 DR_STMT (dr0))).iterate (i, &dr);
1541 i++)
1542 if (DR_IS_READ (dr))
1544 store_inside_penalty += load_inside_cost;
1545 store_outside_penalty += load_outside_cost;
1547 else
1549 store_inside_penalty += store_inside_cost;
1550 store_outside_penalty += store_outside_cost;
1553 if (load_inside_penalty > store_inside_penalty
1554 || (load_inside_penalty == store_inside_penalty
1555 && load_outside_penalty > store_outside_penalty))
1556 dr0 = first_store;
1559 /* In case there are only loads with different unknown misalignments, use
1560 peeling only if it may help to align other accesses in the loop. */
1561 if (!first_store
1562 && !STMT_VINFO_SAME_ALIGN_REFS (
1563 vinfo_for_stmt (DR_STMT (dr0))).length ()
1564 && vect_supportable_dr_alignment (dr0, false)
1565 != dr_unaligned_supported)
1566 do_peeling = false;
1569 if (do_peeling && !dr0)
1571 /* Peeling is possible, but there is no data access that is not supported
1572 unless aligned. So we try to choose the best possible peeling. */
1574 /* We should get here only if there are drs with known misalignment. */
1575 gcc_assert (!all_misalignments_unknown);
1577 /* Choose the best peeling from the hash table. */
1578 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1579 &body_cost_vec);
1580 if (!dr0 || !npeel)
1581 do_peeling = false;
1584 if (do_peeling)
1586 stmt = DR_STMT (dr0);
1587 stmt_info = vinfo_for_stmt (stmt);
1588 vectype = STMT_VINFO_VECTYPE (stmt_info);
1589 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1591 if (known_alignment_for_access_p (dr0))
1593 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1594 size_zero_node) < 0;
1595 if (!npeel)
1597 /* Since it's known at compile time, compute the number of
1598 iterations in the peeled loop (the peeling factor) for use in
1599 updating DR_MISALIGNMENT values. The peeling factor is the
1600 vectorization factor minus the misalignment as an element
1601 count. */
1602 mis = DR_MISALIGNMENT (dr0);
1603 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1604 npeel = ((negative ? mis - nelements : nelements - mis)
1605 & (nelements - 1));
1608 /* For interleaved data access every iteration accesses all the
1609 members of the group, therefore we divide the number of iterations
1610 by the group size. */
1611 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1612 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1613 npeel /= GROUP_SIZE (stmt_info);
1615 if (dump_enabled_p ())
1616 dump_printf_loc (MSG_NOTE, vect_location,
1617 "Try peeling by %d\n", npeel);
1620 /* Ensure that all data refs can be vectorized after the peel. */
1621 FOR_EACH_VEC_ELT (datarefs, i, dr)
1623 int save_misalignment;
1625 if (dr == dr0)
1626 continue;
1628 stmt = DR_STMT (dr);
1629 stmt_info = vinfo_for_stmt (stmt);
1630 /* For interleaving, only the alignment of the first access
1631 matters. */
1632 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1633 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1634 continue;
1636 /* Strided loads perform only component accesses, alignment is
1637 irrelevant for them. */
1638 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1639 continue;
1641 save_misalignment = DR_MISALIGNMENT (dr);
1642 vect_update_misalignment_for_peel (dr, dr0, npeel);
1643 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1644 SET_DR_MISALIGNMENT (dr, save_misalignment);
1646 if (!supportable_dr_alignment)
1648 do_peeling = false;
1649 break;
1653 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1655 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1656 if (!stat)
1657 do_peeling = false;
1658 else
1660 body_cost_vec.release ();
1661 return stat;
1665 if (do_peeling)
1667 unsigned max_allowed_peel
1668 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1669 if (max_allowed_peel != (unsigned)-1)
1671 unsigned max_peel = npeel;
1672 if (max_peel == 0)
1674 gimple dr_stmt = DR_STMT (dr0);
1675 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1676 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1677 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1679 if (max_peel > max_allowed_peel)
1681 do_peeling = false;
1682 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_NOTE, vect_location,
1684 "Disable peeling, max peels reached: %d\n", max_peel);
1689 if (do_peeling)
1691 stmt_info_for_cost *si;
1692 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1694 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1695 If the misalignment of DR_i is identical to that of dr0 then set
1696 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1697 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1698 by the peeling factor times the element size of DR_i (MOD the
1699 vectorization factor times the size). Otherwise, the
1700 misalignment of DR_i must be set to unknown. */
1701 FOR_EACH_VEC_ELT (datarefs, i, dr)
1702 if (dr != dr0)
1703 vect_update_misalignment_for_peel (dr, dr0, npeel);
1705 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1706 if (npeel)
1707 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1708 else
1709 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1710 = DR_MISALIGNMENT (dr0);
1711 SET_DR_MISALIGNMENT (dr0, 0);
1712 if (dump_enabled_p ())
1714 dump_printf_loc (MSG_NOTE, vect_location,
1715 "Alignment of access forced using peeling.\n");
1716 dump_printf_loc (MSG_NOTE, vect_location,
1717 "Peeling for alignment will be applied.\n");
1719 /* We've delayed passing the inside-loop peeling costs to the
1720 target cost model until we were sure peeling would happen.
1721 Do so now. */
1722 if (body_cost_vec.exists ())
1724 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1726 struct _stmt_vec_info *stmt_info
1727 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1728 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1729 si->misalign, vect_body);
1731 body_cost_vec.release ();
1734 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1735 gcc_assert (stat);
1736 return stat;
1740 body_cost_vec.release ();
1742 /* (2) Versioning to force alignment. */
1744 /* Try versioning if:
1745 1) optimize loop for speed
1746 2) there is at least one unsupported misaligned data ref with an unknown
1747 misalignment, and
1748 3) all misaligned data refs with a known misalignment are supported, and
1749 4) the number of runtime alignment checks is within reason. */
1751 do_versioning =
1752 optimize_loop_nest_for_speed_p (loop)
1753 && (!loop->inner); /* FORNOW */
1755 if (do_versioning)
1757 FOR_EACH_VEC_ELT (datarefs, i, dr)
1759 stmt = DR_STMT (dr);
1760 stmt_info = vinfo_for_stmt (stmt);
1762 /* For interleaving, only the alignment of the first access
1763 matters. */
1764 if (aligned_access_p (dr)
1765 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1766 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1767 continue;
1769 /* Strided loads perform only component accesses, alignment is
1770 irrelevant for them. */
1771 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1772 continue;
1774 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1776 if (!supportable_dr_alignment)
1778 gimple stmt;
1779 int mask;
1780 tree vectype;
1782 if (known_alignment_for_access_p (dr)
1783 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1784 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1786 do_versioning = false;
1787 break;
1790 stmt = DR_STMT (dr);
1791 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1792 gcc_assert (vectype);
1794 /* The rightmost bits of an aligned address must be zeros.
1795 Construct the mask needed for this test. For example,
1796 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1797 mask must be 15 = 0xf. */
1798 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1800 /* FORNOW: use the same mask to test all potentially unaligned
1801 references in the loop. The vectorizer currently supports
1802 a single vector size, see the reference to
1803 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1804 vectorization factor is computed. */
1805 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1806 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1807 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1808 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1809 DR_STMT (dr));
1813 /* Versioning requires at least one misaligned data reference. */
1814 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1815 do_versioning = false;
1816 else if (!do_versioning)
1817 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1820 if (do_versioning)
1822 vec<gimple> may_misalign_stmts
1823 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1824 gimple stmt;
1826 /* It can now be assumed that the data references in the statements
1827 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1828 of the loop being vectorized. */
1829 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1831 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1832 dr = STMT_VINFO_DATA_REF (stmt_info);
1833 SET_DR_MISALIGNMENT (dr, 0);
1834 if (dump_enabled_p ())
1835 dump_printf_loc (MSG_NOTE, vect_location,
1836 "Alignment of access forced using versioning.\n");
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE, vect_location,
1841 "Versioning for alignment will be applied.\n");
1843 /* Peeling and versioning can't be done together at this time. */
1844 gcc_assert (! (do_peeling && do_versioning));
1846 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1847 gcc_assert (stat);
1848 return stat;
1851 /* This point is reached if neither peeling nor versioning is being done. */
1852 gcc_assert (! (do_peeling || do_versioning));
1854 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1855 return stat;
1859 /* Function vect_find_same_alignment_drs.
1861 Update group and alignment relations according to the chosen
1862 vectorization factor. */
1864 static void
1865 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1866 loop_vec_info loop_vinfo)
1868 unsigned int i;
1869 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1870 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1871 struct data_reference *dra = DDR_A (ddr);
1872 struct data_reference *drb = DDR_B (ddr);
1873 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1874 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1875 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1876 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1877 lambda_vector dist_v;
1878 unsigned int loop_depth;
1880 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1881 return;
1883 if (dra == drb)
1884 return;
1886 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1887 return;
1889 /* Loop-based vectorization and known data dependence. */
1890 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1891 return;
1893 /* Data-dependence analysis reports a distance vector of zero
1894 for data-references that overlap only in the first iteration
1895 but have steps of different sign (see PR45764).
1896 So as a sanity check require equal DR_STEP. */
1897 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1898 return;
1900 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1901 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1903 int dist = dist_v[loop_depth];
1905 if (dump_enabled_p ())
1906 dump_printf_loc (MSG_NOTE, vect_location,
1907 "dependence distance = %d.\n", dist);
1909 /* Same loop iteration. */
1910 if (dist == 0
1911 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1913 /* Two references with distance zero have the same alignment. */
1914 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1915 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1916 if (dump_enabled_p ())
1918 dump_printf_loc (MSG_NOTE, vect_location,
1919 "accesses have the same alignment.\n");
1920 dump_printf (MSG_NOTE,
1921 "dependence distance modulo vf == 0 between ");
1922 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1923 dump_printf (MSG_NOTE, " and ");
1924 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1925 dump_printf (MSG_NOTE, "\n");
1932 /* Function vect_analyze_data_refs_alignment
1934 Analyze the alignment of the data-references in the loop.
1935 Return FALSE if a data reference is found that cannot be vectorized. */
1937 bool
1938 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1939 bb_vec_info bb_vinfo)
1941 if (dump_enabled_p ())
1942 dump_printf_loc (MSG_NOTE, vect_location,
1943 "=== vect_analyze_data_refs_alignment ===\n");
1945 /* Mark groups of data references with same alignment using
1946 data dependence information. */
1947 if (loop_vinfo)
1949 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1950 struct data_dependence_relation *ddr;
1951 unsigned int i;
1953 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1954 vect_find_same_alignment_drs (ddr, loop_vinfo);
1957 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1959 if (dump_enabled_p ())
1960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1961 "not vectorized: can't calculate alignment "
1962 "for data ref.\n");
1963 return false;
1966 return true;
1970 /* Analyze groups of accesses: check that DR belongs to a group of
1971 accesses of legal size, step, etc. Detect gaps, single element
1972 interleaving, and other special cases. Set grouped access info.
1973 Collect groups of strided stores for further use in SLP analysis. */
1975 static bool
1976 vect_analyze_group_access (struct data_reference *dr)
1978 tree step = DR_STEP (dr);
1979 tree scalar_type = TREE_TYPE (DR_REF (dr));
1980 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
1981 gimple stmt = DR_STMT (dr);
1982 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1983 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1984 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
1985 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
1986 HOST_WIDE_INT groupsize, last_accessed_element = 1;
1987 bool slp_impossible = false;
1988 struct loop *loop = NULL;
1990 if (loop_vinfo)
1991 loop = LOOP_VINFO_LOOP (loop_vinfo);
1993 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
1994 size of the interleaving group (including gaps). */
1995 groupsize = absu_hwi (dr_step) / type_size;
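/* E.g. a 4-byte element accessed with a 16-byte step gives GROUPSIZE 4:
   the group spans four element slots, whether or not all of them are
   actually accessed.  */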
1997 /* A non-consecutive access is possible only if it is part of an interleaving group. */
1998 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2000 /* Check whether this DR is part of an interleaving group, and is a single
2001 element of the group that is accessed in the loop. */
2003 /* Gaps are supported only for loads. STEP must be a multiple of the type
2004 size. The size of the group must be a power of 2. */
2005 if (DR_IS_READ (dr)
2006 && (dr_step % type_size) == 0
2007 && groupsize > 0
2008 && exact_log2 (groupsize) != -1)
2010 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2011 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2012 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_NOTE, vect_location,
2015 "Detected single element interleaving ");
2016 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2017 dump_printf (MSG_NOTE, " step ");
2018 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2019 dump_printf (MSG_NOTE, "\n");
2022 if (loop_vinfo)
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_NOTE, vect_location,
2026 "Data access with gaps requires scalar "
2027 "epilogue loop\n");
2028 if (loop->inner)
2030 if (dump_enabled_p ())
2031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2032 "Peeling for outer loop is not"
2033 " supported\n");
2034 return false;
2037 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2040 return true;
2043 if (dump_enabled_p ())
2045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2046 "not consecutive access ");
2047 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2048 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2051 if (bb_vinfo)
2053 /* Mark the statement as unvectorizable. */
2054 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2055 return true;
2058 return false;
2061 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2063 /* First stmt in the interleaving chain. Check the chain. */
2064 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2065 struct data_reference *data_ref = dr;
2066 unsigned int count = 1;
2067 tree prev_init = DR_INIT (data_ref);
2068 gimple prev = stmt;
2069 HOST_WIDE_INT diff, gaps = 0;
2070 unsigned HOST_WIDE_INT count_in_bytes;
2072 while (next)
2074 /* Skip identical data-refs. If two or more stmts share a
2075 data-ref (supported only for loads), we vectorize only the first
2076 stmt, and the rest get their vectorized loads from the first
2077 one. */
2078 if (!tree_int_cst_compare (DR_INIT (data_ref),
2079 DR_INIT (STMT_VINFO_DATA_REF (
2080 vinfo_for_stmt (next)))))
2082 if (DR_IS_WRITE (data_ref))
2084 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2086 "Two store stmts share the same dr.\n");
2087 return false;
2090 /* For loads, reuse the same data-ref load. */
2091 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2093 prev = next;
2094 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2095 continue;
2098 prev = next;
2099 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2101 /* All group members have the same STEP by construction. */
2102 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2104 /* Check that the distance between two accesses is equal to the type
2105 size. Otherwise, we have gaps. */
2106 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2107 - TREE_INT_CST_LOW (prev_init)) / type_size;
2108 if (diff != 1)
2110 /* FORNOW: SLP of accesses with gaps is not supported. */
2111 slp_impossible = true;
2112 if (DR_IS_WRITE (data_ref))
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2116 "interleaved store with gaps\n");
2117 return false;
2120 gaps += diff - 1;
2123 last_accessed_element += diff;
2125 /* Store the gap from the previous member of the group. If there is no
2126 gap in the access, GROUP_GAP is always 1. */
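/* E.g. inits 0, 4 and 12 with a 4-byte type give diffs of 1 and 2, so
   the third member records a GROUP_GAP of 2 (one element skipped).  */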
2127 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2129 prev_init = DR_INIT (data_ref);
2130 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2131 /* Count the number of data-refs in the chain. */
2132 count++;
2135 /* COUNT is the number of accesses found; we multiply it by the size of
2136 the type to get COUNT_IN_BYTES. */
2137 count_in_bytes = type_size * count;
2139 /* Check that the size of the interleaving (including gaps) is not
2140 greater than STEP. */
2141 if (dr_step != 0
2142 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2144 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147 "interleaving size is greater than step for ");
2148 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2149 DR_REF (dr));
2150 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2152 return false;
2155 /* Check that the size of the interleaving is equal to STEP for stores,
2156 i.e., that there are no gaps. */
2157 if (dr_step != 0
2158 && absu_hwi (dr_step) != count_in_bytes)
2160 if (DR_IS_READ (dr))
2162 slp_impossible = true;
2163 /* There is a gap after the last load in the group. This gap is the
2164 difference between the groupsize and the number of elements.
2165 When there is no gap, this difference should be 0. */
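/* E.g. a load group with GROUPSIZE 4 but only 3 members ends with a
   GROUP_GAP of 1 recorded on the first stmt of the chain.  */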
2166 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2168 else
2170 if (dump_enabled_p ())
2171 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2172 "interleaved store with gaps\n");
2173 return false;
2177 /* Check that STEP is a multiple of type size. */
2178 if (dr_step != 0
2179 && (dr_step % type_size) != 0)
2181 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184 "step is not a multiple of type size: step ");
2185 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2186 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2187 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2188 TYPE_SIZE_UNIT (scalar_type));
2189 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2191 return false;
2194 if (groupsize == 0)
2195 groupsize = count;
2197 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2198 if (dump_enabled_p ())
2199 dump_printf_loc (MSG_NOTE, vect_location,
2200 "Detected interleaving of size %d\n", (int)groupsize);
2202 /* SLP: create an SLP data structure for every interleaving group of
2203 stores for further analysis in vect_analyse_slp. */
2204 if (DR_IS_WRITE (dr) && !slp_impossible)
2206 if (loop_vinfo)
2207 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2208 if (bb_vinfo)
2209 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2212 /* There is a gap at the end of the group. */
2213 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2215 if (dump_enabled_p ())
2216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2217 "Data access with gaps requires scalar "
2218 "epilogue loop\n");
2219 if (loop->inner)
2221 if (dump_enabled_p ())
2222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2223 "Peeling for outer loop is not supported\n");
2224 return false;
2227 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2231 return true;
2235 /* Analyze the access pattern of the data-reference DR.
2236 In case of non-consecutive accesses call vect_analyze_group_access() to
2237 analyze groups of accesses. */
2239 static bool
2240 vect_analyze_data_ref_access (struct data_reference *dr)
2242 tree step = DR_STEP (dr);
2243 tree scalar_type = TREE_TYPE (DR_REF (dr));
2244 gimple stmt = DR_STMT (dr);
2245 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2246 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2247 struct loop *loop = NULL;
2249 if (loop_vinfo)
2250 loop = LOOP_VINFO_LOOP (loop_vinfo);
2252 if (loop_vinfo && !step)
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256 "bad data-ref access in loop\n");
2257 return false;
2260 /* Allow invariant loads in loops that are not nested. */
2261 if (loop_vinfo && integer_zerop (step))
2263 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2264 if (nested_in_vect_loop_p (loop, stmt))
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location,
2268 "zero step in inner loop of nest\n");
2269 return false;
2271 return DR_IS_READ (dr);
2274 if (loop && nested_in_vect_loop_p (loop, stmt))
2276 /* Interleaved accesses are not yet supported within outer-loop
2277 vectorization for references in the inner-loop. */
2278 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2280 /* For the rest of the analysis we use the outer-loop step. */
2281 step = STMT_VINFO_DR_STEP (stmt_info);
2282 if (integer_zerop (step))
2284 if (dump_enabled_p ())
2285 dump_printf_loc (MSG_NOTE, vect_location,
2286 "zero step in outer loop.\n");
2287 if (DR_IS_READ (dr))
2288 return true;
2289 else
2290 return false;
2294 /* Consecutive? */
2295 if (TREE_CODE (step) == INTEGER_CST)
2297 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2298 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2299 || (dr_step < 0
2300 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2302 /* Mark that it is not interleaving. */
2303 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2304 return true;
2308 if (loop && nested_in_vect_loop_p (loop, stmt))
2310 if (dump_enabled_p ())
2311 dump_printf_loc (MSG_NOTE, vect_location,
2312 "grouped access in outer loop.\n");
2313 return false;
2316 /* Assume this is a DR handled by the non-constant strided load case. */
2317 if (TREE_CODE (step) != INTEGER_CST)
2318 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2320 /* Not a consecutive access - check if it is part of an interleaving group. */
2321 return vect_analyze_group_access (dr);
2326 /* A helper function used in the comparator function to sort data
2327 references. T1 and T2 are the two trees to be compared.
2328 The function returns -1, 0, or 1. */
2330 static int
2331 compare_tree (tree t1, tree t2)
2333 int i, cmp;
2334 enum tree_code code;
2335 char tclass;
2337 if (t1 == t2)
2338 return 0;
2339 if (t1 == NULL)
2340 return -1;
2341 if (t2 == NULL)
2342 return 1;
2345 if (TREE_CODE (t1) != TREE_CODE (t2))
2346 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2348 code = TREE_CODE (t1);
2349 switch (code)
2351 /* For const values, we can just use hash values for comparisons. */
2352 case INTEGER_CST:
2353 case REAL_CST:
2354 case FIXED_CST:
2355 case STRING_CST:
2356 case COMPLEX_CST:
2357 case VECTOR_CST:
2359 hashval_t h1 = iterative_hash_expr (t1, 0);
2360 hashval_t h2 = iterative_hash_expr (t2, 0);
2361 if (h1 != h2)
2362 return h1 < h2 ? -1 : 1;
2363 break;
2366 case SSA_NAME:
2367 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2368 if (cmp != 0)
2369 return cmp;
2371 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2372 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2373 break;
2375 default:
2376 tclass = TREE_CODE_CLASS (code);
2378 /* For declarations, we can compare their UIDs. */
2379 if (tclass == tcc_declaration)
2381 if (DECL_UID (t1) != DECL_UID (t2))
2382 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2383 break;
2386 /* For expressions with operands, compare their operands recursively. */
2387 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2389 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2390 if (cmp != 0)
2391 return cmp;
2395 return 0;
2399 /* Compare two data-references DRA and DRB to sort them into chunks
2400 suitable for building interleaving groups. */
2402 static int
2403 dr_group_sort_cmp (const void *dra_, const void *drb_)
2405 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2406 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2407 int cmp;
2409 /* Stabilize sort. */
2410 if (dra == drb)
2411 return 0;
2413 /* Ordering of DRs according to base. */
2414 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2416 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2417 if (cmp != 0)
2418 return cmp;
2421 /* And according to DR_OFFSET. */
2422 if (!dr_equal_offsets_p (dra, drb))
2424 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2425 if (cmp != 0)
2426 return cmp;
2429 /* Put reads before writes. */
2430 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2431 return DR_IS_READ (dra) ? -1 : 1;
2433 /* Then sort by access size. */
2434 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2435 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2437 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2438 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2439 if (cmp != 0)
2440 return cmp;
2443 /* And by step. */
2444 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2446 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2447 if (cmp != 0)
2448 return cmp;
2451 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2452 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2453 if (cmp == 0)
2454 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2455 return cmp;
2458 /* Function vect_analyze_data_ref_accesses.
2460 Analyze the access pattern of all the data references in the loop.
2462 FORNOW: the only access pattern that is considered vectorizable is a
2463 simple step 1 (consecutive) access.
2465 FORNOW: handle only arrays and pointer accesses. */
2467 bool
2468 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2470 unsigned int i;
2471 vec<data_reference_p> datarefs;
2472 struct data_reference *dr;
2474 if (dump_enabled_p ())
2475 dump_printf_loc (MSG_NOTE, vect_location,
2476 "=== vect_analyze_data_ref_accesses ===\n");
2478 if (loop_vinfo)
2479 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2480 else
2481 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2483 if (datarefs.is_empty ())
2484 return true;
2486 /* Sort the array of datarefs to make building the interleaving chains
2487 linear. Don't modify the original vector's order; it is needed for
2488 determining what dependencies are reversed. */
2489 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2490 qsort (datarefs_copy.address (), datarefs_copy.length (),
2491 sizeof (data_reference_p), dr_group_sort_cmp);
2493 /* Build the interleaving chains. */
2494 for (i = 0; i < datarefs_copy.length () - 1;)
2496 data_reference_p dra = datarefs_copy[i];
2497 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2498 stmt_vec_info lastinfo = NULL;
2499 for (i = i + 1; i < datarefs_copy.length (); ++i)
2501 data_reference_p drb = datarefs_copy[i];
2502 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2504 /* ??? Imperfect sorting (non-compatible types, non-modulo
2505 accesses, same accesses) can cause a group to be artificially
2506 split here, as we don't just skip over those. If it really
2507 matters we can push those to a worklist and re-iterate
2508 over them. Then we can just skip ahead to the next DR here. */
2510 /* Check that the data-refs have the same first location (except init)
2511 and that they are both either stores or loads (not one load and one store). */
2512 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2513 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2514 DR_BASE_ADDRESS (drb), 0)
2515 || !dr_equal_offsets_p (dra, drb))
2516 break;
2518 /* Check that the data-refs have the same constant size and step. */
2519 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2520 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2521 if (!tree_fits_uhwi_p (sza)
2522 || !tree_fits_uhwi_p (szb)
2523 || !tree_int_cst_equal (sza, szb)
2524 || !tree_fits_shwi_p (DR_STEP (dra))
2525 || !tree_fits_shwi_p (DR_STEP (drb))
2526 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2527 break;
2529 /* Do not place the same access in the interleaving chain twice. */
2530 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2531 break;
2533 /* Check the types are compatible.
2534 ??? We don't distinguish this during sorting. */
2535 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2536 TREE_TYPE (DR_REF (drb))))
2537 break;
2539 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2540 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2541 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2542 gcc_assert (init_a < init_b);
2544 /* If init_b == init_a + the size of the type * k, we have an
2545 interleaving, and DRA is accessed before DRB. */
2546 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2547 if ((init_b - init_a) % type_size_a != 0)
2548 break;
2550 /* The step (if not zero) must be greater than the difference between the
2551 data-refs' inits; this splits groups into suitable sizes. */
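/* For instance, 4-byte accesses at inits 0, 4, 8 and 12 with step 16
   stay in one group, while an access at init 16 starts a new group
   because the step is not greater than the difference in inits.  */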
2552 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2553 if (step != 0 && step <= (init_b - init_a))
2554 break;
2556 if (dump_enabled_p ())
2558 dump_printf_loc (MSG_NOTE, vect_location,
2559 "Detected interleaving ");
2560 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2561 dump_printf (MSG_NOTE, " and ");
2562 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2563 dump_printf (MSG_NOTE, "\n");
2566 /* Link the found element into the group list. */
2567 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2569 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2570 lastinfo = stmtinfo_a;
2572 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2573 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2574 lastinfo = stmtinfo_b;
2578 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2579 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2580 && !vect_analyze_data_ref_access (dr))
2582 if (dump_enabled_p ())
2583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2584 "not vectorized: complicated access pattern.\n");
2586 if (bb_vinfo)
2588 /* Mark the statement as not vectorizable. */
2589 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2590 continue;
2592 else
2594 datarefs_copy.release ();
2595 return false;
2599 datarefs_copy.release ();
2600 return true;
2604 /* Operator == between two dr_with_seg_len objects.
2606 This equality operator is used to make sure two data refs
2607 are the same one, so that we will consider combining the
2608 aliasing checks of those two pairs of data-dependent data
2609 refs. */
2611 static bool
2612 operator == (const dr_with_seg_len& d1,
2613 const dr_with_seg_len& d2)
2615 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2616 DR_BASE_ADDRESS (d2.dr), 0)
2617 && compare_tree (d1.offset, d2.offset) == 0
2618 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2621 /* Function comp_dr_with_seg_len_pair.
2623 Comparison function for sorting objects of dr_with_seg_len_pair_t
2624 so that we can combine aliasing checks in one scan. */
2626 static int
2627 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2629 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2630 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2632 const dr_with_seg_len &p11 = p1->first,
2633 &p12 = p1->second,
2634 &p21 = p2->first,
2635 &p22 = p2->second;
2637 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2638 if a and c have the same base address and step, and b and d have the same
2639 address and step. Therefore, if either a&c or b&d don't have the same address
2640 and step, we don't care about the order of those two pairs after sorting. */
2641 int comp_res;
2643 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2644 DR_BASE_ADDRESS (p21.dr))) != 0)
2645 return comp_res;
2646 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2647 DR_BASE_ADDRESS (p22.dr))) != 0)
2648 return comp_res;
2649 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2650 return comp_res;
2651 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2652 return comp_res;
2653 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2654 return comp_res;
2655 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2656 return comp_res;
2658 return 0;
2661 template <class T> static void
2662 swap (T& a, T& b)
2664 T c (a);
2665 a = b;
2666 b = c;
2669 /* Function vect_vfa_segment_size.
2671 Create an expression that computes the size of the segment
2672 that will be accessed for a data reference. The function takes into
2673 account that realignment loads may access one more vector.
2675 Input:
2676 DR: The data reference.
2677 LENGTH_FACTOR: segment length to consider.
2679 Return an expression whose value is the size of the segment which will be
2680 accessed by DR. */
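/* As an illustration: for a data-ref with DR_STEP 4 and LENGTH_FACTOR 4
   the segment length is 16 bytes; for a dr_explicit_realign_optimized
   access one extra vector size is added on top of that.  */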
2682 static tree
2683 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2685 tree segment_length;
2687 if (integer_zerop (DR_STEP (dr)))
2688 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2689 else
2690 segment_length = size_binop (MULT_EXPR,
2691 fold_convert (sizetype, DR_STEP (dr)),
2692 fold_convert (sizetype, length_factor));
2694 if (vect_supportable_dr_alignment (dr, false)
2695 == dr_explicit_realign_optimized)
2697 tree vector_size = TYPE_SIZE_UNIT
2698 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2700 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2702 return segment_length;
2705 /* Function vect_prune_runtime_alias_test_list.
2707 Prune a list of ddrs to be tested at run-time by versioning for alias.
2708 Merge several alias checks into one if possible.
2709 Return FALSE if the resulting list of ddrs is longer than allowed by
2710 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2712 bool
2713 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2715 vec<ddr_p> may_alias_ddrs =
2716 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2717 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2718 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2719 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2720 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2722 ddr_p ddr;
2723 unsigned int i;
2724 tree length_factor;
2726 if (dump_enabled_p ())
2727 dump_printf_loc (MSG_NOTE, vect_location,
2728 "=== vect_prune_runtime_alias_test_list ===\n");
2730 if (may_alias_ddrs.is_empty ())
2731 return true;
2733 /* Basically, for each pair of dependent data refs store_ptr_0
2734 and load_ptr_0, we create an expression:
2736 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2737 || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
2739 for aliasing checks. However, in some cases we can decrease
2740 the number of checks by combining two checks into one. For
2741 example, suppose we have another pair of data refs store_ptr_0
2742 and load_ptr_1, and if the following condition is satisfied:
2744 load_ptr_0 < load_ptr_1 &&
2745 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2747 (this condition means that, in each iteration of the vectorized loop,
2748 the accessed memory of store_ptr_0 cannot be between the memory
2749 of load_ptr_0 and load_ptr_1.)
2751 then we can use only the following expression to finish the
2752 aliasing checks between store_ptr_0 & load_ptr_0 and
2753 store_ptr_0 & load_ptr_1:
2755 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2756 || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
2758 Note that we only consider the case where load_ptr_0 and load_ptr_1 have the
2759 same base address. */
2761 comp_alias_ddrs.create (may_alias_ddrs.length ());
2763 /* First, we collect all data ref pairs for aliasing checks. */
2764 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2766 struct data_reference *dr_a, *dr_b;
2767 gimple dr_group_first_a, dr_group_first_b;
2768 tree segment_length_a, segment_length_b;
2769 gimple stmt_a, stmt_b;
2771 dr_a = DDR_A (ddr);
2772 stmt_a = DR_STMT (DDR_A (ddr));
2773 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2774 if (dr_group_first_a)
2776 stmt_a = dr_group_first_a;
2777 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2780 dr_b = DDR_B (ddr);
2781 stmt_b = DR_STMT (DDR_B (ddr));
2782 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2783 if (dr_group_first_b)
2785 stmt_b = dr_group_first_b;
2786 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2789 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2790 length_factor = scalar_loop_iters;
2791 else
2792 length_factor = size_int (vect_factor);
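/* Presumably, when both DRs have the same step the gap between them
   stays constant, so one vectorization factor's worth of accesses is
   enough to check; with different steps the segments have to cover the
   whole scalar iteration count.  */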
2793 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2794 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2796 dr_with_seg_len_pair_t dr_with_seg_len_pair
2797 (dr_with_seg_len (dr_a, segment_length_a),
2798 dr_with_seg_len (dr_b, segment_length_b));
2800 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2801 swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2803 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2806 /* Second, we sort the collected data ref pairs so that we can scan
2807 them once to combine all possible aliasing checks. */
2808 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2810 /* Third, we scan the sorted dr pairs and check if we can combine
2811 alias checks of two neighbouring dr pairs. */
2812 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2814 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2815 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2816 *dr_b1 = &comp_alias_ddrs[i-1].second,
2817 *dr_a2 = &comp_alias_ddrs[i].first,
2818 *dr_b2 = &comp_alias_ddrs[i].second;
2820 /* Remove duplicate data ref pairs. */
2821 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2823 if (dump_enabled_p ())
2825 dump_printf_loc (MSG_NOTE, vect_location,
2826 "found equal ranges ");
2827 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2828 DR_REF (dr_a1->dr));
2829 dump_printf (MSG_NOTE, ", ");
2830 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2831 DR_REF (dr_b1->dr));
2832 dump_printf (MSG_NOTE, " and ");
2833 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2834 DR_REF (dr_a2->dr));
2835 dump_printf (MSG_NOTE, ", ");
2836 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2837 DR_REF (dr_b2->dr));
2838 dump_printf (MSG_NOTE, "\n");
2841 comp_alias_ddrs.ordered_remove (i--);
2842 continue;
2845 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2847 /* We consider the case where DR_B1 and DR_B2 are the same memref,
2848 and DR_A1 and DR_A2 are two consecutive memrefs. */
2849 if (*dr_a1 == *dr_a2)
2851 swap (dr_a1, dr_b1);
2852 swap (dr_a2, dr_b2);
2855 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2856 DR_BASE_ADDRESS (dr_a2->dr),
2858 || !tree_fits_shwi_p (dr_a1->offset)
2859 || !tree_fits_shwi_p (dr_a2->offset))
2860 continue;
2862 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2863 - tree_to_shwi (dr_a1->offset));
2866 /* Now we check if the following condition is satisfied:
2868 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2870 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2871 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2872 have to make a best estimate. We can get the minimum value
2873 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B;
2874 then either of the following two conditions can guarantee the
2875 one above:
2877 1: DIFF <= MIN_SEG_LEN_B
2878 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2882 HOST_WIDE_INT
2883 min_seg_len_b = (TREE_CODE (dr_b1->seg_len) == INTEGER_CST) ?
2884 TREE_INT_CST_LOW (dr_b1->seg_len) :
2885 vect_factor;
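/* Illustration: with DR_A1->OFFSET 0, DR_A2->OFFSET 16 and MIN_SEG_LEN_B 16,
   DIFF is 16 and condition 1 holds, so the second pair is dropped and
   DR_A1's segment length becomes DR_A2's segment length plus DIFF.  */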
2887 if (diff <= min_seg_len_b
2888 || (TREE_CODE (dr_a1->seg_len) == INTEGER_CST
2889 && diff - (HOST_WIDE_INT) TREE_INT_CST_LOW (dr_a1->seg_len) <
2890 min_seg_len_b))
2892 dr_a1->seg_len = size_binop (PLUS_EXPR,
2893 dr_a2->seg_len, size_int (diff));
2894 comp_alias_ddrs.ordered_remove (i--);
2899 if ((int) comp_alias_ddrs.length () >
2900 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2902 if (dump_enabled_p ())
2904 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2905 "disable versioning for alias - max number of "
2906 "generated checks exceeded.\n");
2909 return false;
2912 return true;
2915 /* Check whether a non-affine read in stmt is suitable for gather load
2916 and if so, return a builtin decl for that operation. */
2918 tree
2919 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2920 tree *offp, int *scalep)
2922 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2923 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2924 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2925 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2926 tree offtype = NULL_TREE;
2927 tree decl, base, off;
2928 enum machine_mode pmode;
2929 int punsignedp, pvolatilep;
2931 base = DR_REF (dr);
2932 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2933 see if we can use the def stmt of the address. */
2934 if (is_gimple_call (stmt)
2935 && gimple_call_internal_p (stmt)
2936 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2937 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2938 && TREE_CODE (base) == MEM_REF
2939 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2940 && integer_zerop (TREE_OPERAND (base, 1))
2941 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2943 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2944 if (is_gimple_assign (def_stmt)
2945 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
2946 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2949 /* The gather builtins need an address of the form
2950 loop_invariant + vector * {1, 2, 4, 8}
2951 or
2952 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2953 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2954 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2955 multiplications and additions in it. To get a vector, we need
2956 a single SSA_NAME that will be defined in the loop and will
2957 contain everything that is not loop invariant and that can be
2958 vectorized. The following code attempts to find such a preexisting
2959 SSA_NAME OFF and put the loop invariants into a tree BASE
2960 that can be gimplified before the loop. */
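/* For instance, for an access like a[i] into a loop-invariant array of
   4-byte elements with i varying in the loop, this typically results in
   BASE being (essentially) &a, OFF the SSA_NAME of i and SCALE 4.  */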
2961 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
2962 &pmode, &punsignedp, &pvolatilep, false);
2963 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2965 if (TREE_CODE (base) == MEM_REF)
2967 if (!integer_zerop (TREE_OPERAND (base, 1)))
2969 if (off == NULL_TREE)
2971 double_int moff = mem_ref_offset (base);
2972 off = double_int_to_tree (sizetype, moff);
2974 else
2975 off = size_binop (PLUS_EXPR, off,
2976 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2978 base = TREE_OPERAND (base, 0);
2980 else
2981 base = build_fold_addr_expr (base);
2983 if (off == NULL_TREE)
2984 off = size_zero_node;
2986 /* If base is not loop invariant, then, if off is 0, we start with just
2987 the constant offset in the loop invariant BASE and continue with base
2988 as OFF; otherwise we give up.
2989 We could handle that case by gimplifying the addition of base + off
2990 into some SSA_NAME and use that as off, but for now punt. */
2991 if (!expr_invariant_in_loop_p (loop, base))
2993 if (!integer_zerop (off))
2994 return NULL_TREE;
2995 off = base;
2996 base = size_int (pbitpos / BITS_PER_UNIT);
2998 /* Otherwise put base + constant offset into the loop invariant BASE
2999 and continue with OFF. */
3000 else
3002 base = fold_convert (sizetype, base);
3003 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3006 /* OFF at this point may be either a SSA_NAME or some tree expression
3007 from get_inner_reference. Try to peel off loop invariants from it
3008 into BASE as long as possible. */
3009 STRIP_NOPS (off);
3010 while (offtype == NULL_TREE)
3012 enum tree_code code;
3013 tree op0, op1, add = NULL_TREE;
3015 if (TREE_CODE (off) == SSA_NAME)
3017 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3019 if (expr_invariant_in_loop_p (loop, off))
3020 return NULL_TREE;
3022 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3023 break;
3025 op0 = gimple_assign_rhs1 (def_stmt);
3026 code = gimple_assign_rhs_code (def_stmt);
3027 op1 = gimple_assign_rhs2 (def_stmt);
3029 else
3031 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3032 return NULL_TREE;
3033 code = TREE_CODE (off);
3034 extract_ops_from_tree (off, &code, &op0, &op1);
3036 switch (code)
3038 case POINTER_PLUS_EXPR:
3039 case PLUS_EXPR:
3040 if (expr_invariant_in_loop_p (loop, op0))
3042 add = op0;
3043 off = op1;
3044 do_add:
3045 add = fold_convert (sizetype, add);
3046 if (scale != 1)
3047 add = size_binop (MULT_EXPR, add, size_int (scale));
3048 base = size_binop (PLUS_EXPR, base, add);
3049 continue;
3051 if (expr_invariant_in_loop_p (loop, op1))
3053 add = op1;
3054 off = op0;
3055 goto do_add;
3057 break;
3058 case MINUS_EXPR:
3059 if (expr_invariant_in_loop_p (loop, op1))
3061 add = fold_convert (sizetype, op1);
3062 add = size_binop (MINUS_EXPR, size_zero_node, add);
3063 off = op0;
3064 goto do_add;
3066 break;
3067 case MULT_EXPR:
3068 if (scale == 1 && tree_fits_shwi_p (op1))
3070 scale = tree_to_shwi (op1);
3071 off = op0;
3072 continue;
3074 break;
3075 case SSA_NAME:
3076 off = op0;
3077 continue;
3078 CASE_CONVERT:
3079 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3080 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3081 break;
3082 if (TYPE_PRECISION (TREE_TYPE (op0))
3083 == TYPE_PRECISION (TREE_TYPE (off)))
3085 off = op0;
3086 continue;
3088 if (TYPE_PRECISION (TREE_TYPE (op0))
3089 < TYPE_PRECISION (TREE_TYPE (off)))
3091 off = op0;
3092 offtype = TREE_TYPE (off);
3093 STRIP_NOPS (off);
3094 continue;
3096 break;
3097 default:
3098 break;
3100 break;
3103 /* If at the end OFF still isn't a SSA_NAME or isn't
3104 defined in the loop, punt. */
3105 if (TREE_CODE (off) != SSA_NAME
3106 || expr_invariant_in_loop_p (loop, off))
3107 return NULL_TREE;
3109 if (offtype == NULL_TREE)
3110 offtype = TREE_TYPE (off);
3112 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3113 offtype, scale);
3114 if (decl == NULL_TREE)
3115 return NULL_TREE;
3117 if (basep)
3118 *basep = base;
3119 if (offp)
3120 *offp = off;
3121 if (scalep)
3122 *scalep = scale;
3123 return decl;
3126 /* Function vect_analyze_data_refs.
3128 Find all the data references in the loop or basic block.
3130 The general structure of the analysis of data refs in the vectorizer is as
3131 follows:
3132 1- vect_analyze_data_refs(loop/bb): call
3133 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3134 in the loop/bb and their dependences.
3135 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3136 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3137 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3141 bool
3142 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3143 bb_vec_info bb_vinfo,
3144 int *min_vf)
3146 struct loop *loop = NULL;
3147 basic_block bb = NULL;
3148 unsigned int i;
3149 vec<data_reference_p> datarefs;
3150 struct data_reference *dr;
3151 tree scalar_type;
3153 if (dump_enabled_p ())
3154 dump_printf_loc (MSG_NOTE, vect_location,
3155 "=== vect_analyze_data_refs ===\n");
3157 if (loop_vinfo)
3159 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3161 loop = LOOP_VINFO_LOOP (loop_vinfo);
3162 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3163 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3165 if (dump_enabled_p ())
3166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3167 "not vectorized: loop contains function calls"
3168 " or data references that cannot be analyzed\n");
3169 return false;
3172 for (i = 0; i < loop->num_nodes; i++)
3174 gimple_stmt_iterator gsi;
3176 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3178 gimple stmt = gsi_stmt (gsi);
3179 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3181 if (is_gimple_call (stmt) && loop->safelen)
3183 tree fndecl = gimple_call_fndecl (stmt), op;
3184 if (fndecl != NULL_TREE)
3186 struct cgraph_node *node = cgraph_get_node (fndecl);
3187 if (node != NULL && node->simd_clones != NULL)
3189 unsigned int j, n = gimple_call_num_args (stmt);
3190 for (j = 0; j < n; j++)
3192 op = gimple_call_arg (stmt, j);
3193 if (DECL_P (op)
3194 || (REFERENCE_CLASS_P (op)
3195 && get_base_address (op)))
3196 break;
3198 op = gimple_call_lhs (stmt);
3199 /* Ignore #pragma omp declare simd functions
3200 if they don't have data references in the
3201 call stmt itself. */
3202 if (j == n
3203 && !(op
3204 && (DECL_P (op)
3205 || (REFERENCE_CLASS_P (op)
3206 && get_base_address (op)))))
3207 continue;
3211 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3212 if (dump_enabled_p ())
3213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3214 "not vectorized: loop contains function "
3215 "calls or data references that cannot "
3216 "be analyzed\n");
3217 return false;
3222 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3224 else
3226 gimple_stmt_iterator gsi;
3228 bb = BB_VINFO_BB (bb_vinfo);
3229 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3231 gimple stmt = gsi_stmt (gsi);
3232 if (!find_data_references_in_stmt (NULL, stmt,
3233 &BB_VINFO_DATAREFS (bb_vinfo)))
3235 /* Mark the rest of the basic-block as unvectorizable. */
3236 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3238 stmt = gsi_stmt (gsi);
3239 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3241 break;
3245 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3248 /* Go through the data-refs, check that the analysis succeeded. Update
3249 pointer from stmt_vec_info struct to DR and vectype. */
3251 FOR_EACH_VEC_ELT (datarefs, i, dr)
3253 gimple stmt;
3254 stmt_vec_info stmt_info;
3255 tree base, offset, init;
3256 bool gather = false;
3257 bool simd_lane_access = false;
3258 int vf;
3260 again:
3261 if (!dr || !DR_REF (dr))
3263 if (dump_enabled_p ())
3264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3265 "not vectorized: unhandled data-ref\n");
3266 return false;
3269 stmt = DR_STMT (dr);
3270 stmt_info = vinfo_for_stmt (stmt);
3272 /* Discard clobbers from the dataref vector. We will remove
3273 clobber stmts during vectorization. */
3274 if (gimple_clobber_p (stmt))
3276 if (i == datarefs.length () - 1)
3278 datarefs.pop ();
3279 break;
3281 datarefs[i] = datarefs.pop ();
3282 goto again;
3285 /* Check that analysis of the data-ref succeeded. */
3286 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3287 || !DR_STEP (dr))
3289 bool maybe_gather
3290 = DR_IS_READ (dr)
3291 && !TREE_THIS_VOLATILE (DR_REF (dr))
3292 && targetm.vectorize.builtin_gather != NULL;
3293 bool maybe_simd_lane_access
3294 = loop_vinfo && loop->simduid;
3296 /* If target supports vector gather loads, or if this might be
3297 a SIMD lane access, see if they can be used. */
3298 if (loop_vinfo
3299 && (maybe_gather || maybe_simd_lane_access)
3300 && !nested_in_vect_loop_p (loop, stmt))
3302 struct data_reference *newdr
3303 = create_data_ref (NULL, loop_containing_stmt (stmt),
3304 DR_REF (dr), stmt, true);
3305 gcc_assert (newdr != NULL && DR_REF (newdr));
3306 if (DR_BASE_ADDRESS (newdr)
3307 && DR_OFFSET (newdr)
3308 && DR_INIT (newdr)
3309 && DR_STEP (newdr)
3310 && integer_zerop (DR_STEP (newdr)))
3312 if (maybe_simd_lane_access)
3314 tree off = DR_OFFSET (newdr);
3315 STRIP_NOPS (off);
3316 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3317 && TREE_CODE (off) == MULT_EXPR
3318 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3320 tree step = TREE_OPERAND (off, 1);
3321 off = TREE_OPERAND (off, 0);
3322 STRIP_NOPS (off);
3323 if (CONVERT_EXPR_P (off)
3324 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3325 0)))
3326 < TYPE_PRECISION (TREE_TYPE (off)))
3327 off = TREE_OPERAND (off, 0);
3328 if (TREE_CODE (off) == SSA_NAME)
3330 gimple def = SSA_NAME_DEF_STMT (off);
3331 tree reft = TREE_TYPE (DR_REF (newdr));
3332 if (is_gimple_call (def)
3333 && gimple_call_internal_p (def)
3334 && (gimple_call_internal_fn (def)
3335 == IFN_GOMP_SIMD_LANE))
3337 tree arg = gimple_call_arg (def, 0);
3338 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3339 arg = SSA_NAME_VAR (arg);
3340 if (arg == loop->simduid
3341 /* For now. */
3342 && tree_int_cst_equal
3343 (TYPE_SIZE_UNIT (reft),
3344 step))
3346 DR_OFFSET (newdr) = ssize_int (0);
3347 DR_STEP (newdr) = step;
3348 DR_ALIGNED_TO (newdr)
3349 = size_int (BIGGEST_ALIGNMENT);
3350 dr = newdr;
3351 simd_lane_access = true;
3357 if (!simd_lane_access && maybe_gather)
3359 dr = newdr;
3360 gather = true;
3363 if (!gather && !simd_lane_access)
3364 free_data_ref (newdr);
3367 if (!gather && !simd_lane_access)
3369 if (dump_enabled_p ())
3371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3372 "not vectorized: data ref analysis "
3373 "failed ");
3374 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3375 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3378 if (bb_vinfo)
3379 break;
3381 return false;
3385 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3387 if (dump_enabled_p ())
3388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3389 "not vectorized: base addr of dr is a "
3390 "constant\n");
3392 if (bb_vinfo)
3393 break;
3395 if (gather || simd_lane_access)
3396 free_data_ref (dr);
3397 return false;
3400 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3402 if (dump_enabled_p ())
3404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3405 "not vectorized: volatile type ");
3406 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3407 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3410 if (bb_vinfo)
3411 break;
3413 return false;
3416 if (stmt_can_throw_internal (stmt))
3418 if (dump_enabled_p ())
3420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3421 "not vectorized: statement can throw an "
3422 "exception ");
3423 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3424 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3427 if (bb_vinfo)
3428 break;
3430 if (gather || simd_lane_access)
3431 free_data_ref (dr);
3432 return false;
3435 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3436 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3438 if (dump_enabled_p ())
3440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3441 "not vectorized: statement is bitfield "
3442 "access ");
3443 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3444 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3447 if (bb_vinfo)
3448 break;
3450 if (gather || simd_lane_access)
3451 free_data_ref (dr);
3452 return false;
3455 base = unshare_expr (DR_BASE_ADDRESS (dr));
3456 offset = unshare_expr (DR_OFFSET (dr));
3457 init = unshare_expr (DR_INIT (dr));
3459 if (is_gimple_call (stmt)
3460 && (!gimple_call_internal_p (stmt)
3461 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3462 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3464 if (dump_enabled_p ())
3466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3467 "not vectorized: dr in a call ");
3468 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3469 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3472 if (bb_vinfo)
3473 break;
3475 if (gather || simd_lane_access)
3476 free_data_ref (dr);
3477 return false;
3480 /* Update DR field in stmt_vec_info struct. */
3482 /* If the dataref is in an inner-loop of the loop that is considered
3483 for vectorization, we also want to analyze the access relative to
3484 the outer-loop (DR contains information only relative to the
3485 inner-most enclosing loop). We do that by building a reference to the
3486 first location accessed by the inner-loop, and analyze it relative to
3487 the outer-loop. */
3488 if (loop && nested_in_vect_loop_p (loop, stmt))
3490 tree outer_step, outer_base, outer_init;
3491 HOST_WIDE_INT pbitsize, pbitpos;
3492 tree poffset;
3493 enum machine_mode pmode;
3494 int punsignedp, pvolatilep;
3495 affine_iv base_iv, offset_iv;
3496 tree dinit;
3498 /* Build a reference to the first location accessed by the
3499 inner-loop: *(BASE+INIT). (The first location is actually
3500 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3501 tree inner_base = build_fold_indirect_ref
3502 (fold_build_pointer_plus (base, init));
3504 if (dump_enabled_p ())
3506 dump_printf_loc (MSG_NOTE, vect_location,
3507 "analyze in outer-loop: ");
3508 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3509 dump_printf (MSG_NOTE, "\n");
3512 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3513 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3514 gcc_assert (outer_base != NULL_TREE);
3516 if (pbitpos % BITS_PER_UNIT != 0)
3518 if (dump_enabled_p ())
3519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3520 "failed: bit offset alignment.\n");
3521 return false;
3524 outer_base = build_fold_addr_expr (outer_base);
3525 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3526 &base_iv, false))
3528 if (dump_enabled_p ())
3529 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3530 "failed: evolution of base is not affine.\n");
3531 return false;
3534 if (offset)
3536 if (poffset)
3537 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3538 poffset);
3539 else
3540 poffset = offset;
3543 if (!poffset)
3545 offset_iv.base = ssize_int (0);
3546 offset_iv.step = ssize_int (0);
3548 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3549 &offset_iv, false))
3551 if (dump_enabled_p ())
3552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3553 "evolution of offset is not affine.\n");
3554 return false;
3557 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3558 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3559 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3560 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3561 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3563 outer_step = size_binop (PLUS_EXPR,
3564 fold_convert (ssizetype, base_iv.step),
3565 fold_convert (ssizetype, offset_iv.step));
3567 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3568 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3569 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3570 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3571 STMT_VINFO_DR_OFFSET (stmt_info) =
3572 fold_convert (ssizetype, offset_iv.base);
3573 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3574 size_int (highest_pow2_factor (offset_iv.base));
3576 if (dump_enabled_p ())
3578 dump_printf_loc (MSG_NOTE, vect_location,
3579 "\touter base_address: ");
3580 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3581 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3582 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3583 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3584 STMT_VINFO_DR_OFFSET (stmt_info));
3585 dump_printf (MSG_NOTE,
3586 "\n\touter constant offset from base address: ");
3587 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3588 STMT_VINFO_DR_INIT (stmt_info));
3589 dump_printf (MSG_NOTE, "\n\touter step: ");
3590 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3591 STMT_VINFO_DR_STEP (stmt_info));
3592 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3593 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3594 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3595 dump_printf (MSG_NOTE, "\n");
3599 if (STMT_VINFO_DATA_REF (stmt_info))
3601 if (dump_enabled_p ())
3603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3604 "not vectorized: more than one data ref "
3605 "in stmt: ");
3606 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3607 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3610 if (bb_vinfo)
3611 break;
3613 if (gather || simd_lane_access)
3614 free_data_ref (dr);
3615 return false;
3618 STMT_VINFO_DATA_REF (stmt_info) = dr;
3619 if (simd_lane_access)
3621 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3622 datarefs[i] = dr;
3625 /* Set vectype for STMT. */
3626 scalar_type = TREE_TYPE (DR_REF (dr));
3627 STMT_VINFO_VECTYPE (stmt_info) =
3628 get_vectype_for_scalar_type (scalar_type);
3629 if (!STMT_VINFO_VECTYPE (stmt_info))
3631 if (dump_enabled_p ())
3633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3634 "not vectorized: no vectype for stmt: ");
3635 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3636 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3637 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3638 scalar_type);
3639 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3642 if (bb_vinfo)
3643 break;
3645 if (gather || simd_lane_access)
3647 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3648 free_data_ref (dr);
3650 return false;
3652 else
3654 if (dump_enabled_p ())
3656 dump_printf_loc (MSG_NOTE, vect_location,
3657 "got vectype for stmt: ");
3658 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3659 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3660 STMT_VINFO_VECTYPE (stmt_info));
3661 dump_printf (MSG_NOTE, "\n");
3665 /* Adjust the minimal vectorization factor according to the
3666 vector type. */
3667 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3668 if (vf > *min_vf)
3669 *min_vf = vf;
3671 if (gather)
3673 tree off;
3675 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3676 if (gather
3677 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3678 gather = false;
3679 if (!gather)
3681 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3682 free_data_ref (dr);
3683 if (dump_enabled_p ())
3685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3686 "not vectorized: not suitable for gather "
3687 "load ");
3688 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3689 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3691 return false;
3694 datarefs[i] = dr;
3695 STMT_VINFO_GATHER_P (stmt_info) = true;
3697 else if (loop_vinfo
3698 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3700 if (nested_in_vect_loop_p (loop, stmt)
3701 || !DR_IS_READ (dr))
3703 if (dump_enabled_p ())
3705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3706 "not vectorized: not suitable for strided "
3707 "load ");
3708 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3709 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3711 return false;
3713 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3717 /* If we stopped analysis at the first dataref we could not analyze
3718 when trying to vectorize a basic-block, mark the rest of the datarefs
3719 as not vectorizable and truncate the vector of datarefs. That
3720 avoids spending useless time analyzing their dependences. */
3721 if (i != datarefs.length ())
3723 gcc_assert (bb_vinfo != NULL);
3724 for (unsigned j = i; j < datarefs.length (); ++j)
3726 data_reference_p dr = datarefs[j];
3727 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3728 free_data_ref (dr);
3730 datarefs.truncate (i);
3733 return true;
3737 /* Function vect_get_new_vect_var.
3739 Returns a name for a new variable. The current naming scheme prepends
3740 the prefix "vect_", "stmp_" or "vectp_" (depending on the value of
3741 VAR_KIND) to NAME if it is provided, and otherwise uses the prefix alone
3742 to name the vectorizer-generated variable. */
3744 tree
3745 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3747 const char *prefix;
3748 tree new_vect_var;
3750 switch (var_kind)
3752 case vect_simple_var:
3753 prefix = "vect";
3754 break;
3755 case vect_scalar_var:
3756 prefix = "stmp";
3757 break;
3758 case vect_pointer_var:
3759 prefix = "vectp";
3760 break;
3761 default:
3762 gcc_unreachable ();
3765 if (name)
3767 char* tmp = concat (prefix, "_", name, NULL);
3768 new_vect_var = create_tmp_reg (type, tmp);
3769 free (tmp);
3771 else
3772 new_vect_var = create_tmp_reg (type, prefix);
3774 return new_vect_var;
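/* For illustration (hypothetical NAME "x"), the three variable kinds
   produce temporaries named roughly as follows:

     vect_get_new_vect_var (vectype,     vect_simple_var,  "x")  -> vect_x
     vect_get_new_vect_var (scalar_type, vect_scalar_var,  "x")  -> stmp_x
     vect_get_new_vect_var (ptr_type,    vect_pointer_var, "x")  -> vectp_x

   Dumps may show an additional numeric suffix once the temporary is
   uniquified.  */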
3778 /* Function vect_create_addr_base_for_vector_ref.
3780 Create an expression that computes the address of the first memory location
3781 that will be accessed for a data reference.
3783 Input:
3784 STMT: The statement containing the data reference.
3785 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3786 OFFSET: Optional. If supplied, it is added to the initial address.
3787 LOOP: Specifies relative to which loop-nest the address should be computed.
3788 For example, when the dataref is in an inner-loop nested in an
3789 outer-loop that is now being vectorized, LOOP can be either the
3790 outer-loop, or the inner-loop. The first memory location accessed
3791 by the following dataref ('in' points to short):
3793 for (i=0; i<N; i++)
3794 for (j=0; j<M; j++)
3795 s += in[i+j]
3797 is as follows:
3798 if LOOP=i_loop: &in (relative to i_loop)
3799 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3801 Output:
3802 1. Return an SSA_NAME whose value is the address of the memory location of
3803 the first vector of the data reference.
3804 2. If new_stmt_list is not NULL_TREE after return, then the caller must insert
3805 these statement(s), which define the returned SSA_NAME.
3807 FORNOW: We are only handling array accesses with step 1. */
3809 tree
3810 vect_create_addr_base_for_vector_ref (gimple stmt,
3811 gimple_seq *new_stmt_list,
3812 tree offset,
3813 struct loop *loop)
3815 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3816 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3817 tree data_ref_base;
3818 const char *base_name;
3819 tree addr_base;
3820 tree dest;
3821 gimple_seq seq = NULL;
3822 tree base_offset;
3823 tree init;
3824 tree vect_ptr_type;
3825 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3826 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3828 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3830 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3832 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3834 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3835 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3836 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3838 else
3840 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3841 base_offset = unshare_expr (DR_OFFSET (dr));
3842 init = unshare_expr (DR_INIT (dr));
3845 if (loop_vinfo)
3846 base_name = get_name (data_ref_base);
3847 else
3849 base_offset = ssize_int (0);
3850 init = ssize_int (0);
3851 base_name = get_name (DR_REF (dr));
3854 /* Create base_offset */
3855 base_offset = size_binop (PLUS_EXPR,
3856 fold_convert (sizetype, base_offset),
3857 fold_convert (sizetype, init));
3859 if (offset)
3861 offset = fold_build2 (MULT_EXPR, sizetype,
3862 fold_convert (sizetype, offset), step);
3863 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3864 base_offset, offset);
3867 /* base + base_offset */
3868 if (loop_vinfo)
3869 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3870 else
3872 addr_base = build1 (ADDR_EXPR,
3873 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3874 unshare_expr (DR_REF (dr)));
3877 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3878 addr_base = fold_convert (vect_ptr_type, addr_base);
3879 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3880 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3881 gimple_seq_add_seq (new_stmt_list, seq);
3883 if (DR_PTR_INFO (dr)
3884 && TREE_CODE (addr_base) == SSA_NAME)
3886 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3887 if (offset)
3888 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3891 if (dump_enabled_p ())
3893 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3894 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3895 dump_printf (MSG_NOTE, "\n");
3898 return addr_base;
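/* As a rough illustration (hypothetical dataref a[i] with
   DR_BASE_ADDRESS = &a, DR_OFFSET = 0, DR_INIT = 0, and an OFFSET
   argument of 3), the code above builds approximately

     base_offset = 0 + 0 + 3 * sizeof (a[0]);
     addr_base   = (vectype *) ((char *) &a + base_offset);

   and forces the result into a fresh SSA name whose base variable is
   prefixed "vectp_".  */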
3902 /* Function vect_create_data_ref_ptr.
3904 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3905 location accessed in the loop by STMT, along with the def-use update
3906 chain to appropriately advance the pointer through the loop iterations.
3907 Also set aliasing information for the pointer. This pointer is used by
3908 the callers to this function to create a memory reference expression for
3909 vector load/store access.
3911 Input:
3912 1. STMT: a stmt that references memory. Expected to be of the form
3913 GIMPLE_ASSIGN <name, data-ref> or
3914 GIMPLE_ASSIGN <data-ref, name>.
3915 2. AGGR_TYPE: the type of the reference, which should be either a vector
3916 or an array.
3917 3. AT_LOOP: the loop where the vector memref is to be created.
3918 4. OFFSET (optional): an offset to be added to the initial address accessed
3919 by the data-ref in STMT.
3920 5. BSI: location where the new stmts are to be placed if there is no loop.
3921 6. ONLY_INIT: indicates whether ap is to be updated in the loop, or remain
3922 pointing to the initial address.
3924 Output:
3925 1. Declare a new ptr to vector_type, and have it point to the base of the
3926 data reference (the initial address accessed by the data reference).
3927 For example, for vector of type V8HI, the following code is generated:
3929 v8hi *ap;
3930 ap = (v8hi *)initial_address;
3932 if OFFSET is not supplied:
3933 initial_address = &a[init];
3934 if OFFSET is supplied:
3935 initial_address = &a[init + OFFSET];
3937 Return the initial_address in INITIAL_ADDRESS.
3939 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3940 update the pointer in each iteration of the loop.
3942 Return the increment stmt that updates the pointer in PTR_INCR.
3944 3. Set INV_P to true if the access pattern of the data reference in the
3945 vectorized loop is invariant. Set it to false otherwise.
3947 4. Return the pointer. */
3949 tree
3950 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3951 tree offset, tree *initial_address,
3952 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3953 bool only_init, bool *inv_p)
3955 const char *base_name;
3956 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3957 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3958 struct loop *loop = NULL;
3959 bool nested_in_vect_loop = false;
3960 struct loop *containing_loop = NULL;
3961 tree aggr_ptr_type;
3962 tree aggr_ptr;
3963 tree new_temp;
3964 gimple vec_stmt;
3965 gimple_seq new_stmt_list = NULL;
3966 edge pe = NULL;
3967 basic_block new_bb;
3968 tree aggr_ptr_init;
3969 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3970 tree aptr;
3971 gimple_stmt_iterator incr_gsi;
3972 bool insert_after;
3973 tree indx_before_incr, indx_after_incr;
3974 gimple incr;
3975 tree step;
3976 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3978 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3979 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3981 if (loop_vinfo)
3983 loop = LOOP_VINFO_LOOP (loop_vinfo);
3984 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3985 containing_loop = (gimple_bb (stmt))->loop_father;
3986 pe = loop_preheader_edge (loop);
3988 else
3990 gcc_assert (bb_vinfo);
3991 only_init = true;
3992 *ptr_incr = NULL;
3995 /* Check the step (evolution) of the load in LOOP, and record
3996 whether it's invariant. */
3997 if (nested_in_vect_loop)
3998 step = STMT_VINFO_DR_STEP (stmt_info);
3999 else
4000 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4002 if (integer_zerop (step))
4003 *inv_p = true;
4004 else
4005 *inv_p = false;
4007 /* Create an expression for the first address accessed by this load
4008 in LOOP. */
4009 base_name = get_name (DR_BASE_ADDRESS (dr));
4011 if (dump_enabled_p ())
4013 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4014 dump_printf_loc (MSG_NOTE, vect_location,
4015 "create %s-pointer variable to type: ",
4016 get_tree_code_name (TREE_CODE (aggr_type)));
4017 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4018 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4019 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4020 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4021 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4022 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4023 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4024 else
4025 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4026 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4027 dump_printf (MSG_NOTE, "\n");
4030 /* (1) Create the new aggregate-pointer variable.
4031 Vector and array types inherit the alias set of their component
4032 type by default so we need to use a ref-all pointer if the data
4033 reference does not conflict with the created aggregated data
4034 reference because it is not addressable. */
4035 bool need_ref_all = false;
4036 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4037 get_alias_set (DR_REF (dr))))
4038 need_ref_all = true;
4039 /* Likewise for any of the data references in the stmt group. */
4040 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4042 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4045 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4046 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4047 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4048 get_alias_set (DR_REF (sdr))))
4050 need_ref_all = true;
4051 break;
4053 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4055 while (orig_stmt);
4057 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4058 need_ref_all);
4059 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4062 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4063 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4064 def-use update cycles for the pointer: one relative to the outer-loop
4065 (LOOP), which is what steps (2) and (3) below do. The other is relative
4066 to the inner-loop (which is the inner-most loop containing the dataref),
4067 and this is done by step (4) below.
4069 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4070 inner-most loop, and so steps (2),(3) work the same, and step (4) is
4071 redundant. Steps (2),(3) create the following:
4073 vp0 = &base_addr;
4074 LOOP: vp1 = phi(vp0,vp2)
4077 vp2 = vp1 + step
4078 goto LOOP
4080 If there is an inner-loop nested in LOOP, then step (4) will also be
4081 applied, and an additional update in the inner-loop will be created:
4083 vp0 = &base_addr;
4084 LOOP: vp1 = phi(vp0,vp2)
4086 inner: vp3 = phi(vp1,vp4)
4087 vp4 = vp3 + inner_step
4088 if () goto inner
4090 vp2 = vp1 + step
4091 if () goto LOOP */
4093 /* (2) Calculate the initial address of the aggregate-pointer, and set
4094 the aggregate-pointer to point to it before the loop. */
4096 /* Create: &(base[init_val+offset]) in the loop preheader. */
4098 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4099 offset, loop);
4100 if (new_stmt_list)
4102 if (pe)
4104 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4105 gcc_assert (!new_bb);
4107 else
4108 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4111 *initial_address = new_temp;
4113 /* Create: p = (aggr_type *) initial_base */
4114 if (TREE_CODE (new_temp) != SSA_NAME
4115 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4117 vec_stmt = gimple_build_assign (aggr_ptr,
4118 fold_convert (aggr_ptr_type, new_temp));
4119 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4120 /* Copy the points-to information if it exists. */
4121 if (DR_PTR_INFO (dr))
4122 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4123 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4124 if (pe)
4126 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4127 gcc_assert (!new_bb);
4129 else
4130 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4132 else
4133 aggr_ptr_init = new_temp;
4135 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4136 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4137 inner-loop nested in LOOP (during outer-loop vectorization). */
4139 /* No update in loop is required. */
4140 if (only_init && (!loop_vinfo || at_loop == loop))
4141 aptr = aggr_ptr_init;
4142 else
4144 /* The step of the aggregate pointer is the type size. */
4145 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4146 /* One exception to the above is when the scalar step of the load in
4147 LOOP is zero. In this case the step here is also zero. */
4148 if (*inv_p)
4149 iv_step = size_zero_node;
4150 else if (tree_int_cst_sgn (step) == -1)
4151 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4153 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4155 create_iv (aggr_ptr_init,
4156 fold_convert (aggr_ptr_type, iv_step),
4157 aggr_ptr, loop, &incr_gsi, insert_after,
4158 &indx_before_incr, &indx_after_incr);
4159 incr = gsi_stmt (incr_gsi);
4160 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4162 /* Copy the points-to information if it exists. */
4163 if (DR_PTR_INFO (dr))
4165 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4166 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4168 if (ptr_incr)
4169 *ptr_incr = incr;
4171 aptr = indx_before_incr;
4174 if (!nested_in_vect_loop || only_init)
4175 return aptr;
4178 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4179 nested in LOOP, if it exists. */
4181 gcc_assert (nested_in_vect_loop);
4182 if (!only_init)
4184 standard_iv_increment_position (containing_loop, &incr_gsi,
4185 &insert_after);
4186 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4187 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4188 &indx_after_incr);
4189 incr = gsi_stmt (incr_gsi);
4190 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4192 /* Copy the points-to information if it exists. */
4193 if (DR_PTR_INFO (dr))
4195 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4196 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4198 if (ptr_incr)
4199 *ptr_incr = incr;
4201 return indx_before_incr;
4203 else
4204 gcc_unreachable ();
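/* As a rough illustration (hypothetical names; AGGR_TYPE = V8HI, no
   OFFSET, inner-most loop vectorization with ONLY_INIT false), the
   function typically emits

     preheader:  vectp_a_1 = (v8hi *) &a;                   <- step (2)
     loop:       vectp_a_2 = PHI <vectp_a_1, vectp_a_3>
                 ...
                 vectp_a_3 = vectp_a_2 + 16;                <- step (3)

   and returns vectp_a_2, i.e. the value of the pointer before the
   increment.  */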
4208 /* Function bump_vector_ptr
4210 Increment a pointer (to a vector type) by vector-size. If requested,
4211 i.e. if PTR-INCR is given, then also connect the new increment stmt
4212 to the existing def-use update-chain of the pointer, by modifying
4213 the PTR_INCR as illustrated below:
4215 The pointer def-use update-chain before this function:
4216 DATAREF_PTR = phi (p_0, p_2)
4217 ....
4218 PTR_INCR: p_2 = DATAREF_PTR + step
4220 The pointer def-use update-chain after this function:
4221 DATAREF_PTR = phi (p_0, p_2)
4222 ....
4223 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4224 ....
4225 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4227 Input:
4228 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4229 in the loop.
4230 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4231 the loop. The increment amount across iterations is expected
4232 to be vector_size.
4233 BSI - location where the new update stmt is to be placed.
4234 STMT - the original scalar memory-access stmt that is being vectorized.
4235 BUMP - optional. The offset by which to bump the pointer. If not given,
4236 the offset is assumed to be vector_size.
4238 Output: Return NEW_DATAREF_PTR as illustrated above. */
4242 tree
4243 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4244 gimple stmt, tree bump)
4246 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4247 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4248 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4249 tree update = TYPE_SIZE_UNIT (vectype);
4250 gimple incr_stmt;
4251 ssa_op_iter iter;
4252 use_operand_p use_p;
4253 tree new_dataref_ptr;
4255 if (bump)
4256 update = bump;
4258 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4259 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4260 dataref_ptr, update);
4261 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4263 /* Copy the points-to information if it exists. */
4264 if (DR_PTR_INFO (dr))
4266 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4267 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4270 if (!ptr_incr)
4271 return new_dataref_ptr;
4273 /* Update the vector-pointer's cross-iteration increment. */
4274 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4276 tree use = USE_FROM_PTR (use_p);
4278 if (use == dataref_ptr)
4279 SET_USE (use_p, new_dataref_ptr);
4280 else
4281 gcc_assert (tree_int_cst_compare (use, update) == 0);
4284 return new_dataref_ptr;
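/* For illustration (hypothetical V4SI vector type): with BUMP omitted
   the default bump is TYPE_SIZE_UNIT (vectype) = 16 bytes, so the stmt
   created above is

     NEW_DATAREF_PTR = DATAREF_PTR + 16;

   and, when PTR_INCR is given, its use of DATAREF_PTR is redirected to
   NEW_DATAREF_PTR as shown in the comment above.  */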
4288 /* Function vect_create_destination_var.
4290 Create a new temporary of type VECTYPE. */
4292 tree
4293 vect_create_destination_var (tree scalar_dest, tree vectype)
4295 tree vec_dest;
4296 const char *name;
4297 char *new_name;
4298 tree type;
4299 enum vect_var_kind kind;
4301 kind = vectype ? vect_simple_var : vect_scalar_var;
4302 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4304 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4306 name = get_name (scalar_dest);
4307 if (name)
4308 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4309 else
4310 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4311 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4312 free (new_name);
4314 return vec_dest;
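/* For illustration (hypothetical scalar destination x_7): with a
   non-NULL VECTYPE the call above creates a vector temporary named
   "vect_x_7"; with VECTYPE == NULL it creates a scalar temporary
   "stmp_x_7" of the scalar destination's type.  */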
4317 /* Function vect_grouped_store_supported.
4319 Returns TRUE if interleave high and interleave low permutations
4320 are supported, and FALSE otherwise. */
4322 bool
4323 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4325 enum machine_mode mode = TYPE_MODE (vectype);
4327 /* vect_permute_store_chain requires the group size to be a power of two. */
4328 if (exact_log2 (count) == -1)
4330 if (dump_enabled_p ())
4331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4332 "the size of the group of accesses"
4333 " is not a power of 2\n");
4334 return false;
4337 /* Check that the permutation is supported. */
4338 if (VECTOR_MODE_P (mode))
4340 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4341 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4342 for (i = 0; i < nelt / 2; i++)
4344 sel[i * 2] = i;
4345 sel[i * 2 + 1] = i + nelt;
4347 if (can_vec_perm_p (mode, false, sel))
4349 for (i = 0; i < nelt; i++)
4350 sel[i] += nelt / 2;
4351 if (can_vec_perm_p (mode, false, sel))
4352 return true;
4356 if (dump_enabled_p ())
4357 dump_printf (MSG_MISSED_OPTIMIZATION,
4358 "interleave op not supported by target.\n");
4359 return false;
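/* For illustration (hypothetical 4-element vector mode), the two
   selectors checked above are

     interleave high: {0, 4, 1, 5}
     interleave low:  {2, 6, 3, 7}

   i.e. the same masks that vect_permute_store_chain builds below.  */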
4363 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4364 type VECTYPE. */
4366 bool
4367 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4369 return vect_lanes_optab_supported_p ("vec_store_lanes",
4370 vec_store_lanes_optab,
4371 vectype, count);
4375 /* Function vect_permute_store_chain.
4377 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4378 a power of 2, generate interleave_high/low stmts to reorder the data
4379 correctly for the stores. Return the final references for stores in
4380 RESULT_CHAIN.
4382 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4383 The input is 4 vectors each containing 8 elements. We assign a number to
4384 each element, the input sequence is:
4386 1st vec: 0 1 2 3 4 5 6 7
4387 2nd vec: 8 9 10 11 12 13 14 15
4388 3rd vec: 16 17 18 19 20 21 22 23
4389 4th vec: 24 25 26 27 28 29 30 31
4391 The output sequence should be:
4393 1st vec: 0 8 16 24 1 9 17 25
4394 2nd vec: 2 10 18 26 3 11 19 27
4395 3rd vec: 4 12 20 28 5 13 21 29
4396 4th vec: 6 14 22 30 7 15 23 31
4398 i.e., we interleave the contents of the four vectors in their order.
4400 We use interleave_high/low instructions to create such output. The input of
4401 each interleave_high/low operation is two vectors:
4402 1st vec 2nd vec
4403 0 1 2 3 4 5 6 7
4404 the even elements of the result vector are obtained left-to-right from the
4405 high/low elements of the first vector. The odd elements of the result are
4406 obtained left-to-right from the high/low elements of the second vector.
4407 The output of interleave_high will be: 0 4 1 5
4408 and of interleave_low: 2 6 3 7
4411 The permutation is done in log LENGTH stages. In each stage interleave_high
4412 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4413 where the first argument is taken from the first half of DR_CHAIN and the
4414 second argument from its second half.
4415 In our example,
4417 I1: interleave_high (1st vec, 3rd vec)
4418 I2: interleave_low (1st vec, 3rd vec)
4419 I3: interleave_high (2nd vec, 4th vec)
4420 I4: interleave_low (2nd vec, 4th vec)
4422 The output for the first stage is:
4424 I1: 0 16 1 17 2 18 3 19
4425 I2: 4 20 5 21 6 22 7 23
4426 I3: 8 24 9 25 10 26 11 27
4427 I4: 12 28 13 29 14 30 15 31
4429 The output of the second stage, i.e. the final result is:
4431 I1: 0 8 16 24 1 9 17 25
4432 I2: 2 10 18 26 3 11 19 27
4433 I3: 4 12 20 28 5 13 21 29
4434 I4: 6 14 22 30 7 15 23 31. */
4436 void
4437 vect_permute_store_chain (vec<tree> dr_chain,
4438 unsigned int length,
4439 gimple stmt,
4440 gimple_stmt_iterator *gsi,
4441 vec<tree> *result_chain)
4443 tree vect1, vect2, high, low;
4444 gimple perm_stmt;
4445 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4446 tree perm_mask_low, perm_mask_high;
4447 unsigned int i, n;
4448 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4449 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4451 result_chain->quick_grow (length);
4452 memcpy (result_chain->address (), dr_chain.address (),
4453 length * sizeof (tree));
4455 for (i = 0, n = nelt / 2; i < n; i++)
4457 sel[i * 2] = i;
4458 sel[i * 2 + 1] = i + nelt;
4460 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4461 gcc_assert (perm_mask_high != NULL);
4463 for (i = 0; i < nelt; i++)
4464 sel[i] += nelt / 2;
4465 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4466 gcc_assert (perm_mask_low != NULL);
4468 for (i = 0, n = exact_log2 (length); i < n; i++)
4470 for (j = 0; j < length/2; j++)
4472 vect1 = dr_chain[j];
4473 vect2 = dr_chain[j+length/2];
4475 /* Create interleaving stmt:
4476 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4477 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4478 perm_stmt
4479 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4480 vect1, vect2, perm_mask_high);
4481 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4482 (*result_chain)[2*j] = high;
4484 /* Create interleaving stmt:
4485 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4486 nelt*3/2+1, ...}> */
4487 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4488 perm_stmt
4489 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4490 vect1, vect2, perm_mask_low);
4491 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4492 (*result_chain)[2*j+1] = low;
4494 memcpy (dr_chain.address (), result_chain->address (),
4495 length * sizeof (tree));
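/* For illustration: each stage of the double loop above emits LENGTH
   VEC_PERM_EXPRs (one "high" and one "low" per pair of vectors), and
   there are log2 (LENGTH) stages, so the 4-vector example in the
   comment above generates 4 * 2 = 8 permutation statements in total.  */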
4499 /* Function vect_setup_realignment
4501 This function is called when vectorizing an unaligned load using
4502 the dr_explicit_realign[_optimized] scheme.
4503 This function generates the following code at the loop prolog:
4505 p = initial_addr;
4506 x msq_init = *(floor(p)); # prolog load
4507 realignment_token = call target_builtin;
4508 loop:
4509 x msq = phi (msq_init, ---)
4511 The stmts marked with x are generated only for the case of
4512 dr_explicit_realign_optimized.
4514 The code above sets up a new (vector) pointer, pointing to the first
4515 location accessed by STMT, and a "floor-aligned" load using that pointer.
4516 It also generates code to compute the "realignment-token" (if the relevant
4517 target hook was defined), and creates a phi-node at the loop-header bb
4518 whose arguments are the result of the prolog-load (created by this
4519 function) and the result of a load that takes place in the loop (to be
4520 created by the caller to this function).
4522 For the case of dr_explicit_realign_optimized:
4523 The caller to this function uses the phi-result (msq) to create the
4524 realignment code inside the loop, and sets up the missing phi argument,
4525 as follows:
4526 loop:
4527 msq = phi (msq_init, lsq)
4528 lsq = *(floor(p')); # load in loop
4529 result = realign_load (msq, lsq, realignment_token);
4531 For the case of dr_explicit_realign:
4532 loop:
4533 msq = *(floor(p)); # load in loop
4534 p' = p + (VS-1);
4535 lsq = *(floor(p')); # load in loop
4536 result = realign_load (msq, lsq, realignment_token);
4538 Input:
4539 STMT - (scalar) load stmt to be vectorized. This load accesses
4540 a memory location that may be unaligned.
4541 BSI - place where new code is to be inserted.
4542 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4543 is used.
4545 Output:
4546 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4547 target hook, if defined.
4548 Return value - the result of the loop-header phi node. */
4550 tree
4551 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4552 tree *realignment_token,
4553 enum dr_alignment_support alignment_support_scheme,
4554 tree init_addr,
4555 struct loop **at_loop)
4557 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4558 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4559 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4560 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4561 struct loop *loop = NULL;
4562 edge pe = NULL;
4563 tree scalar_dest = gimple_assign_lhs (stmt);
4564 tree vec_dest;
4565 gimple inc;
4566 tree ptr;
4567 tree data_ref;
4568 gimple new_stmt;
4569 basic_block new_bb;
4570 tree msq_init = NULL_TREE;
4571 tree new_temp;
4572 gimple phi_stmt;
4573 tree msq = NULL_TREE;
4574 gimple_seq stmts = NULL;
4575 bool inv_p;
4576 bool compute_in_loop = false;
4577 bool nested_in_vect_loop = false;
4578 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4579 struct loop *loop_for_initial_load = NULL;
4581 if (loop_vinfo)
4583 loop = LOOP_VINFO_LOOP (loop_vinfo);
4584 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4587 gcc_assert (alignment_support_scheme == dr_explicit_realign
4588 || alignment_support_scheme == dr_explicit_realign_optimized);
4590 /* We need to generate three things:
4591 1. the misalignment computation
4592 2. the extra vector load (for the optimized realignment scheme).
4593 3. the phi node for the two vectors from which the realignment is
4594 done (for the optimized realignment scheme). */
4596 /* 1. Determine where to generate the misalignment computation.
4598 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4599 calculation will be generated by this function, outside the loop (in the
4600 preheader). Otherwise, INIT_ADDR has already been computed for us by the
4601 caller, inside the loop.
4603 Background: If the misalignment remains fixed throughout the iterations of
4604 the loop, then both realignment schemes are applicable, and also the
4605 misalignment computation can be done outside LOOP. This is because we are
4606 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4607 are a multiple of VS (the Vector Size), and therefore the misalignment in
4608 different vectorized LOOP iterations is always the same.
4609 The problem arises only if the memory access is in an inner-loop nested
4610 inside LOOP, which is now being vectorized using outer-loop vectorization.
4611 This is the only case when the misalignment of the memory access may not
4612 remain fixed throughout the iterations of the inner-loop (as explained in
4613 detail in vect_supportable_dr_alignment). In this case, not only is the
4614 optimized realignment scheme not applicable, but also the misalignment
4615 computation (and generation of the realignment token that is passed to
4616 REALIGN_LOAD) has to be done inside the loop.
4618 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4619 or not, which in turn determines if the misalignment is computed inside
4620 the inner-loop, or outside LOOP. */
4622 if (init_addr != NULL_TREE || !loop_vinfo)
4624 compute_in_loop = true;
4625 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4629 /* 2. Determine where to generate the extra vector load.
4631 For the optimized realignment scheme, instead of generating two vector
4632 loads in each iteration, we generate a single extra vector load in the
4633 preheader of the loop, and in each iteration reuse the result of the
4634 vector load from the previous iteration. In case the memory access is in
4635 an inner-loop nested inside LOOP, which is now being vectorized using
4636 outer-loop vectorization, we need to determine whether this initial vector
4637 load should be generated at the preheader of the inner-loop, or can be
4638 generated at the preheader of LOOP. If the memory access has no evolution
4639 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4640 to be generated inside LOOP (in the preheader of the inner-loop). */
4642 if (nested_in_vect_loop)
4644 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4645 bool invariant_in_outerloop =
4646 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4647 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4649 else
4650 loop_for_initial_load = loop;
4651 if (at_loop)
4652 *at_loop = loop_for_initial_load;
4654 if (loop_for_initial_load)
4655 pe = loop_preheader_edge (loop_for_initial_load);
4657 /* 3. For the case of the optimized realignment, create the first vector
4658 load at the loop preheader. */
4660 if (alignment_support_scheme == dr_explicit_realign_optimized)
4662 /* Create msq_init = *(floor(p1)) in the loop preheader */
4664 gcc_assert (!compute_in_loop);
4665 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4666 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4667 NULL_TREE, &init_addr, NULL, &inc,
4668 true, &inv_p);
4669 new_temp = copy_ssa_name (ptr, NULL);
4670 new_stmt = gimple_build_assign_with_ops
4671 (BIT_AND_EXPR, new_temp, ptr,
4672 build_int_cst (TREE_TYPE (ptr),
4673 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4674 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4675 gcc_assert (!new_bb);
4676 data_ref
4677 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4678 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4679 new_stmt = gimple_build_assign (vec_dest, data_ref);
4680 new_temp = make_ssa_name (vec_dest, new_stmt);
4681 gimple_assign_set_lhs (new_stmt, new_temp);
4682 if (pe)
4684 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4685 gcc_assert (!new_bb);
4687 else
4688 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4690 msq_init = gimple_assign_lhs (new_stmt);
4693 /* 4. Create realignment token using a target builtin, if available.
4694 It is done either inside the containing loop, or before LOOP (as
4695 determined above). */
4697 if (targetm.vectorize.builtin_mask_for_load)
4699 tree builtin_decl;
4701 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4702 if (!init_addr)
4704 /* Generate the INIT_ADDR computation outside LOOP. */
4705 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4706 NULL_TREE, loop);
4707 if (loop)
4709 pe = loop_preheader_edge (loop);
4710 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4711 gcc_assert (!new_bb);
4713 else
4714 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4717 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4718 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4719 vec_dest =
4720 vect_create_destination_var (scalar_dest,
4721 gimple_call_return_type (new_stmt));
4722 new_temp = make_ssa_name (vec_dest, new_stmt);
4723 gimple_call_set_lhs (new_stmt, new_temp);
4725 if (compute_in_loop)
4726 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4727 else
4729 /* Generate the misalignment computation outside LOOP. */
4730 pe = loop_preheader_edge (loop);
4731 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4732 gcc_assert (!new_bb);
4735 *realignment_token = gimple_call_lhs (new_stmt);
4737 /* The result of the CALL_EXPR to this builtin is determined from
4738 the value of the parameter and no global variables are touched
4739 which makes the builtin a "const" function. Requiring the
4740 builtin to have the "const" attribute makes it unnecessary
4741 to call mark_call_clobbered. */
4742 gcc_assert (TREE_READONLY (builtin_decl));
4745 if (alignment_support_scheme == dr_explicit_realign)
4746 return msq;
4748 gcc_assert (!compute_in_loop);
4749 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4752 /* 5. Create msq = phi <msq_init, lsq> in loop */
4754 pe = loop_preheader_edge (containing_loop);
4755 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4756 msq = make_ssa_name (vec_dest, NULL);
4757 phi_stmt = create_phi_node (msq, containing_loop->header);
4758 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4760 return msq;
4764 /* Function vect_grouped_load_supported.
4766 Returns TRUE if even and odd permutations are supported,
4767 and FALSE otherwise. */
4769 bool
4770 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4772 enum machine_mode mode = TYPE_MODE (vectype);
4774 /* vect_permute_load_chain requires the group size to be a power of two. */
4775 if (exact_log2 (count) == -1)
4777 if (dump_enabled_p ())
4778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4779 "the size of the group of accesses"
4780 " is not a power of 2\n");
4781 return false;
4784 /* Check that the permutation is supported. */
4785 if (VECTOR_MODE_P (mode))
4787 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4788 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4790 for (i = 0; i < nelt; i++)
4791 sel[i] = i * 2;
4792 if (can_vec_perm_p (mode, false, sel))
4794 for (i = 0; i < nelt; i++)
4795 sel[i] = i * 2 + 1;
4796 if (can_vec_perm_p (mode, false, sel))
4797 return true;
4801 if (dump_enabled_p ())
4802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4803 "extract even/odd not supported by target\n");
4804 return false;
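/* For illustration (hypothetical 4-element vector mode), the two
   selectors checked above are

     extract even: {0, 2, 4, 6}
     extract odd:  {1, 3, 5, 7}

   i.e. the same masks that vect_permute_load_chain builds below.  */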
4807 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4808 type VECTYPE. */
4810 bool
4811 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4813 return vect_lanes_optab_supported_p ("vec_load_lanes",
4814 vec_load_lanes_optab,
4815 vectype, count);
4818 /* Function vect_permute_load_chain.
4820 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4821 a power of 2, generate extract_even/odd stmts to reorder the input data
4822 correctly. Return the final references for loads in RESULT_CHAIN.
4824 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4825 The input is 4 vectors each containing 8 elements. We assign a number to each
4826 element, the input sequence is:
4828 1st vec: 0 1 2 3 4 5 6 7
4829 2nd vec: 8 9 10 11 12 13 14 15
4830 3rd vec: 16 17 18 19 20 21 22 23
4831 4th vec: 24 25 26 27 28 29 30 31
4833 The output sequence should be:
4835 1st vec: 0 4 8 12 16 20 24 28
4836 2nd vec: 1 5 9 13 17 21 25 29
4837 3rd vec: 2 6 10 14 18 22 26 30
4838 4th vec: 3 7 11 15 19 23 27 31
4840 i.e., the first output vector should contain the first elements of each
4841 interleaving group, etc.
4843 We use extract_even/odd instructions to create such output. The input of
4844 each extract_even/odd operation is two vectors
4845 1st vec 2nd vec
4846 0 1 2 3 4 5 6 7
4848 and the output is the vector of extracted even/odd elements. The output of
4849 extract_even will be: 0 2 4 6
4850 and of extract_odd: 1 3 5 7
4853 The permutation is done in log LENGTH stages. In each stage extract_even
4854 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4855 their order. In our example,
4857 E1: extract_even (1st vec, 2nd vec)
4858 E2: extract_odd (1st vec, 2nd vec)
4859 E3: extract_even (3rd vec, 4th vec)
4860 E4: extract_odd (3rd vec, 4th vec)
4862 The output for the first stage will be:
4864 E1: 0 2 4 6 8 10 12 14
4865 E2: 1 3 5 7 9 11 13 15
4866 E3: 16 18 20 22 24 26 28 30
4867 E4: 17 19 21 23 25 27 29 31
4869 In order to proceed and create the correct sequence for the next stage (or
4870 for the correct output, if the second stage is the last one, as in our
4871 example), we first put the output of extract_even operation and then the
4872 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4873 The input for the second stage is:
4875 1st vec (E1): 0 2 4 6 8 10 12 14
4876 2nd vec (E3): 16 18 20 22 24 26 28 30
4877 3rd vec (E2): 1 3 5 7 9 11 13 15
4878 4th vec (E4): 17 19 21 23 25 27 29 31
4880 The output of the second stage:
4882 E1: 0 4 8 12 16 20 24 28
4883 E2: 2 6 10 14 18 22 26 30
4884 E3: 1 5 9 13 17 21 25 29
4885 E4: 3 7 11 15 19 23 27 31
4887 And RESULT_CHAIN after reordering:
4889 1st vec (E1): 0 4 8 12 16 20 24 28
4890 2nd vec (E3): 1 5 9 13 17 21 25 29
4891 3rd vec (E2): 2 6 10 14 18 22 26 30
4892 4th vec (E4): 3 7 11 15 19 23 27 31. */
4894 static void
4895 vect_permute_load_chain (vec<tree> dr_chain,
4896 unsigned int length,
4897 gimple stmt,
4898 gimple_stmt_iterator *gsi,
4899 vec<tree> *result_chain)
4901 tree data_ref, first_vect, second_vect;
4902 tree perm_mask_even, perm_mask_odd;
4903 gimple perm_stmt;
4904 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4905 unsigned int i, j, log_length = exact_log2 (length);
4906 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4907 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4909 result_chain->quick_grow (length);
4910 memcpy (result_chain->address (), dr_chain.address (),
4911 length * sizeof (tree));
4913 for (i = 0; i < nelt; ++i)
4914 sel[i] = i * 2;
4915 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4916 gcc_assert (perm_mask_even != NULL);
4918 for (i = 0; i < nelt; ++i)
4919 sel[i] = i * 2 + 1;
4920 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4921 gcc_assert (perm_mask_odd != NULL);
4923 for (i = 0; i < log_length; i++)
4925 for (j = 0; j < length; j += 2)
4927 first_vect = dr_chain[j];
4928 second_vect = dr_chain[j+1];
4930 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4931 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4932 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4933 first_vect, second_vect,
4934 perm_mask_even);
4935 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4936 (*result_chain)[j/2] = data_ref;
4938 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4939 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4940 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4941 first_vect, second_vect,
4942 perm_mask_odd);
4943 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4944 (*result_chain)[j/2+length/2] = data_ref;
4946 memcpy (dr_chain.address (), result_chain->address (),
4947 length * sizeof (tree));
4952 /* Function vect_transform_grouped_load.
4954 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4955 to perform their permutation and ascribe the resulting vectorized statements to
4956 the scalar statements. */
4959 void
4960 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4961 gimple_stmt_iterator *gsi)
4963 vec<tree> result_chain = vNULL;
4965 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4966 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
4967 vectors that are ready for vector computation. */
4968 result_chain.create (size);
4969 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4970 vect_record_grouped_load_vectors (stmt, result_chain);
4971 result_chain.release ();
4974 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4975 generated as part of the vectorization of STMT. Assign the statement
4976 for each vector to the associated scalar statement. */
4978 void
4979 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4981 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4982 gimple next_stmt, new_stmt;
4983 unsigned int i, gap_count;
4984 tree tmp_data_ref;
4986 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4987 Since we scan the chain starting from its first node, their order
4988 corresponds to the order of data-refs in RESULT_CHAIN. */
4989 next_stmt = first_stmt;
4990 gap_count = 1;
4991 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4993 if (!next_stmt)
4994 break;
4996 /* Skip the gaps. Loads created for the gaps will be removed later by the
4997 dead code elimination pass. No need to check for the first stmt in
4998 the group, since it always exists.
4999 GROUP_GAP is the number of steps in elements from the previous
5000 access (if there is no gap GROUP_GAP is 1). We skip loads that
5001 correspond to the gaps. */
5002 if (next_stmt != first_stmt
5003 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5005 gap_count++;
5006 continue;
5009 while (next_stmt)
5011 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5012 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5013 copies, and we put the new vector statement in the first available
5014 RELATED_STMT. */
5015 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5016 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5017 else
5019 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5021 gimple prev_stmt =
5022 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5023 gimple rel_stmt =
5024 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5025 while (rel_stmt)
5027 prev_stmt = rel_stmt;
5028 rel_stmt =
5029 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5032 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5033 new_stmt;
5037 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5038 gap_count = 1;
5039 /* If NEXT_STMT accesses the same DR as the previous statement,
5040 put the same TMP_DATA_REF as its vectorized statement; otherwise
5041 get the next data-ref from RESULT_CHAIN. */
5042 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5043 break;
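/* For illustration (hypothetical two-statement group a[4*i] and
   a[4*i+2], so GROUP_GAP of the second statement is 2): the first
   permuted vector is recorded for the group leader, the following one
   is skipped because it only covers the gap element a[4*i+1] (it is
   left for dead code elimination), and the next one becomes the
   vectorized statement of a[4*i+2].  */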
5048 /* Function vect_can_force_dr_alignment_p.
5050 Returns whether DECL can be forced to be aligned on an ALIGNMENT-bit
5051 boundary. */
5053 bool
5054 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5056 if (TREE_CODE (decl) != VAR_DECL)
5057 return false;
5059 /* We cannot change alignment of common or external symbols as another
5060 translation unit may contain a definition with lower alignment.
5061 The rules of common symbol linking mean that the definition
5062 will override the common symbol. The same is true for constant
5063 pool entries which may be shared and are not properly merged
5064 by LTO. */
5065 if (DECL_EXTERNAL (decl)
5066 || DECL_COMMON (decl)
5067 || DECL_IN_CONSTANT_POOL (decl))
5068 return false;
5070 if (TREE_ASM_WRITTEN (decl))
5071 return false;
5073 /* Do not override the alignment as specified by the ABI when the used
5074 attribute is set. */
5075 if (DECL_PRESERVE_P (decl))
5076 return false;
5078 /* Do not override explicit alignment set by the user when an explicit
5079 section name is also used. This is a common idiom used by many
5080 software projects. */
5081 if (DECL_SECTION_NAME (decl) != NULL_TREE
5082 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
5083 return false;
5085 if (TREE_STATIC (decl))
5086 return (alignment <= MAX_OFILE_ALIGNMENT);
5087 else
5088 return (alignment <= MAX_STACK_ALIGNMENT);
5092 /* Return whether the data reference DR is supported with respect to its
5093 alignment.
5094 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5095 if it is aligned, i.e., check if it is possible to vectorize it with different
5096 alignment. */
5098 enum dr_alignment_support
5099 vect_supportable_dr_alignment (struct data_reference *dr,
5100 bool check_aligned_accesses)
5102 gimple stmt = DR_STMT (dr);
5103 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5104 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5105 enum machine_mode mode = TYPE_MODE (vectype);
5106 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5107 struct loop *vect_loop = NULL;
5108 bool nested_in_vect_loop = false;
5110 if (aligned_access_p (dr) && !check_aligned_accesses)
5111 return dr_aligned;
5113 /* For now assume all conditional loads/stores support unaligned
5114 access without any special code. */
5115 if (is_gimple_call (stmt)
5116 && gimple_call_internal_p (stmt)
5117 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5118 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5119 return dr_unaligned_supported;
5121 if (loop_vinfo)
5123 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5124 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5127 /* Possibly unaligned access. */
5129 /* We can choose between using the implicit realignment scheme (generating
5130 a misaligned_move stmt) and the explicit realignment scheme (generating
5131 aligned loads with a REALIGN_LOAD). There are two variants to the
5132 explicit realignment scheme: optimized, and unoptimized.
5133 We can optimize the realignment only if the step between consecutive
5134 vector loads is equal to the vector size. Since the vector memory
5135 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5136 is guaranteed that the misalignment amount remains the same throughout the
5137 execution of the vectorized loop. Therefore, we can create the
5138 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5139 at the loop preheader.
5141 However, in the case of outer-loop vectorization, when vectorizing a
5142 memory access in the inner-loop nested within the LOOP that is now being
5143 vectorized, while it is guaranteed that the misalignment of the
5144 vectorized memory access will remain the same in different outer-loop
5145 iterations, it is *not* guaranteed that it will remain the same throughout
5146 the execution of the inner-loop. This is because the inner-loop advances
5147 with the original scalar step (and not in steps of VS). If the inner-loop
5148 step happens to be a multiple of VS, then the misalignment remains fixed
5149 and we can use the optimized realignment scheme. For example:
5151 for (i=0; i<N; i++)
5152 for (j=0; j<M; j++)
5153 s += a[i+j];
5155 When vectorizing the i-loop in the above example, the step between
5156 consecutive vector loads is 1, and so the misalignment does not remain
5157 fixed across the execution of the inner-loop, and the realignment cannot
5158 be optimized (as illustrated in the following pseudo vectorized loop):
5160 for (i=0; i<N; i+=4)
5161 for (j=0; j<M; j++){
5162 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5163 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5164 // (assuming that we start from an aligned address).
5167 We therefore have to use the unoptimized realignment scheme:
5169 for (i=0; i<N; i+=4)
5170 for (j=k; j<M; j+=4)
5171 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5172 // that the misalignment of the initial address is
5173 // 0).
5175 The loop can then be vectorized as follows:
5177 for (k=0; k<4; k++){
5178 rt = get_realignment_token (&vp[k]);
5179 for (i=0; i<N; i+=4){
5180 v1 = vp[i+k];
5181 for (j=k; j<M; j+=4){
5182 v2 = vp[i+j+VS-1];
5183 va = REALIGN_LOAD <v1,v2,rt>;
5184 vs += va;
5185 v1 = v2;
5188 } */
5190 if (DR_IS_READ (dr))
5192 bool is_packed = false;
5193 tree type = (TREE_TYPE (DR_REF (dr)));
5195 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5196 && (!targetm.vectorize.builtin_mask_for_load
5197 || targetm.vectorize.builtin_mask_for_load ()))
5199 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5200 if ((nested_in_vect_loop
5201 && (TREE_INT_CST_LOW (DR_STEP (dr))
5202 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5203 || !loop_vinfo)
5204 return dr_explicit_realign;
5205 else
5206 return dr_explicit_realign_optimized;
5208 if (!known_alignment_for_access_p (dr))
5209 is_packed = not_size_aligned (DR_REF (dr));
5211 if ((TYPE_USER_ALIGN (type) && !is_packed)
5212 || targetm.vectorize.
5213 support_vector_misalignment (mode, type,
5214 DR_MISALIGNMENT (dr), is_packed))
5215 /* Can't software pipeline the loads, but can at least do them. */
5216 return dr_unaligned_supported;
5218 else
5220 bool is_packed = false;
5221 tree type = (TREE_TYPE (DR_REF (dr)));
5223 if (!known_alignment_for_access_p (dr))
5224 is_packed = not_size_aligned (DR_REF (dr));
5226 if ((TYPE_USER_ALIGN (type) && !is_packed)
5227 || targetm.vectorize.
5228 support_vector_misalignment (mode, type,
5229 DR_MISALIGNMENT (dr), is_packed))
5230 return dr_unaligned_supported;
5233 /* Unsupported. */
5234 return dr_unaligned_unsupported;
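/* For illustration: an unaligned read in a vectorized inner-most loop,
   on a target that provides vec_realign_load (and whose
   builtin_mask_for_load hook, if defined, returns a decl), is
   classified above as dr_explicit_realign_optimized; the same read in
   an inner-loop nested in the outer-loop being vectorized, with a step
   different from the vector size, only gets dr_explicit_realign.  */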