gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2014 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "stor-layout.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-ssa-alias.h"
34 #include "internal-fn.h"
35 #include "tree-eh.h"
36 #include "gimple-expr.h"
37 #include "is-a.h"
38 #include "gimple.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "gimple-ssa.h"
43 #include "tree-phinodes.h"
44 #include "ssa-iterators.h"
45 #include "stringpool.h"
46 #include "tree-ssanames.h"
47 #include "tree-ssa-loop-ivopts.h"
48 #include "tree-ssa-loop-manip.h"
49 #include "tree-ssa-loop.h"
50 #include "dumpfile.h"
51 #include "cfgloop.h"
52 #include "tree-chrec.h"
53 #include "tree-scalar-evolution.h"
54 #include "tree-vectorizer.h"
55 #include "diagnostic-core.h"
56 #include "cgraph.h"
57 /* Need to include rtl.h, expr.h, etc. for optabs. */
58 #include "expr.h"
59 #include "optabs.h"
61 /* Return true if load- or store-lanes optab OPTAB is implemented for
62 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
64 static bool
65 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
66 tree vectype, unsigned HOST_WIDE_INT count)
68 enum machine_mode mode, array_mode;
69 bool limit_p;
71 mode = TYPE_MODE (vectype);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
74 MODE_INT, limit_p);
76 if (array_mode == BLKmode)
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
80 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
81 GET_MODE_NAME (mode), count);
82 return false;
85 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
89 "cannot use %s<%s><%s>\n", name,
90 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
91 return false;
94 if (dump_enabled_p ())
95 dump_printf_loc (MSG_NOTE, vect_location,
96 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
97 GET_MODE_NAME (mode));
99 return true;
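/* Editorial note (not part of the GCC sources): a sketch of how this helper
   is used elsewhere in this file.  Assuming a 128-bit V4SI vector type and
   an interleaving group of three stores, a check for an ST3-style
   instruction looks roughly like:

     if (vect_lanes_optab_supported_p ("vec_store_lanes",
                                       vec_store_lanes_optab,
                                       v4si_vectype, 3))
       ...  use the store-lanes code path  ...

   The helper asks the target (via array_mode_supported_p and mode_for_size)
   for an integer mode wide enough for 3 * 128 bits and then checks that the
   corresponding vec_store_lanes pattern exists for that mode pair.  */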
103 /* Return the smallest scalar part of STMT.
104 This is used to determine the vectype of the stmt. We generally set the
105 vectype according to the type of the result (lhs). For stmts whose
106 result-type is different than the type of the arguments (e.g., demotion,
107 promotion), vectype will be reset appropriately (later). Note that we have
108 to visit the smallest datatype in this function, because that determines the
109 VF. If the smallest datatype in the loop is present only as the rhs of a
110 promotion operation - we'd miss it.
111 Such a case, where a variable of this datatype does not appear in the lhs
112 anywhere in the loop, can only occur if it's an invariant: e.g.:
113 'int_x = (int) short_inv', which we'd expect to have been optimized away by
114 invariant motion. However, we cannot rely on invariant motion to always
115 take invariants out of the loop, and so in the case of promotion we also
116 have to check the rhs.
117 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
118 types. */
120 tree
121 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
122 HOST_WIDE_INT *rhs_size_unit)
124 tree scalar_type = gimple_expr_type (stmt);
125 HOST_WIDE_INT lhs, rhs;
127 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129 if (is_gimple_assign (stmt)
130 && (gimple_assign_cast_p (stmt)
131 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
132 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
133 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
135 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
137 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
138 if (rhs < lhs)
139 scalar_type = rhs_type;
142 *lhs_size_unit = lhs;
143 *rhs_size_unit = rhs;
144 return scalar_type;
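/* Editorial example (not from the GCC sources), illustrating the comment
   above: for a widening statement such as

     int_x = (int) short_y;

   the lhs is 4 bytes and the rhs is 2 bytes, so the smaller (short) type is
   returned.  The vectorization factor is then derived from the 2-byte
   element, e.g. VF = 8 for 128-bit vectors rather than VF = 4.  */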
148 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
149 tested at run-time. Return TRUE if DDR was successfully inserted.
150 Return false if versioning is not supported. */
152 static bool
153 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
155 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
157 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
158 return false;
160 if (dump_enabled_p ())
162 dump_printf_loc (MSG_NOTE, vect_location,
163 "mark for run-time aliasing test between ");
164 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
165 dump_printf (MSG_NOTE, " and ");
166 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
167 dump_printf (MSG_NOTE, "\n");
170 if (optimize_loop_nest_for_size_p (loop))
172 if (dump_enabled_p ())
173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
174 "versioning not supported when optimizing"
175 " for size.\n");
176 return false;
179 /* FORNOW: We don't support versioning with outer-loop vectorization. */
180 if (loop->inner)
182 if (dump_enabled_p ())
183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
184 "versioning not yet supported for outer-loops.\n");
185 return false;
188 /* FORNOW: We don't support creating runtime alias tests for non-constant
189 step. */
190 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
191 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
193 if (dump_enabled_p ())
194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
195 "versioning not yet supported for non-constant "
196 "step\n");
197 return false;
200 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
201 return true;
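/* Editorial sketch (an assumption about code generated later, not part of
   this function): every DDR pushed onto LOOP_VINFO_MAY_ALIAS_DDRS above is
   eventually turned into a run-time non-overlap guard of roughly this
   shape when the loop is versioned:

     if (addr_a + segment_len_a <= addr_b
         || addr_b + segment_len_b <= addr_a)
       ...  vectorized loop  ...
     else
       ...  scalar fallback loop  ...

   which is why a non-constant DR_STEP (unknown segment length) or a loop
   optimized for size is rejected above.  */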
205 /* Function vect_analyze_data_ref_dependence.
 207    Return TRUE if there (might) exist a dependence between a memory-reference
 208    DRA and a memory-reference DRB.  When the dependence can instead be checked
 209    by a run-time alias test (versioning for alias), return FALSE.  Adjust
 210    *MAX_VF according to the data dependence.  */
212 static bool
213 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
214 loop_vec_info loop_vinfo, int *max_vf)
216 unsigned int i;
217 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
218 struct data_reference *dra = DDR_A (ddr);
219 struct data_reference *drb = DDR_B (ddr);
220 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
221 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
222 lambda_vector dist_v;
223 unsigned int loop_depth;
225 /* In loop analysis all data references should be vectorizable. */
226 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
227 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
228 gcc_unreachable ();
230 /* Independent data accesses. */
231 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
232 return false;
234 if (dra == drb
235 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
236 return false;
238 /* Unknown data dependence. */
239 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
241 /* If user asserted safelen consecutive iterations can be
242 executed concurrently, assume independence. */
243 if (loop->safelen >= 2)
245 if (loop->safelen < *max_vf)
246 *max_vf = loop->safelen;
247 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
248 return false;
251 if (STMT_VINFO_GATHER_P (stmtinfo_a)
252 || STMT_VINFO_GATHER_P (stmtinfo_b))
254 if (dump_enabled_p ())
256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
257 "versioning for alias not supported for: "
258 "can't determine dependence between ");
259 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
260 DR_REF (dra));
261 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
262 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
263 DR_REF (drb));
264 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
266 return true;
269 if (dump_enabled_p ())
271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
272 "versioning for alias required: "
273 "can't determine dependence between ");
274 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
275 DR_REF (dra));
276 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
278 DR_REF (drb));
279 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
282 /* Add to list of ddrs that need to be tested at run-time. */
283 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
286 /* Known data dependence. */
287 if (DDR_NUM_DIST_VECTS (ddr) == 0)
289 /* If user asserted safelen consecutive iterations can be
290 executed concurrently, assume independence. */
291 if (loop->safelen >= 2)
293 if (loop->safelen < *max_vf)
294 *max_vf = loop->safelen;
295 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
296 return false;
299 if (STMT_VINFO_GATHER_P (stmtinfo_a)
300 || STMT_VINFO_GATHER_P (stmtinfo_b))
302 if (dump_enabled_p ())
304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
305 "versioning for alias not supported for: "
306 "bad dist vector for ");
307 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
308 DR_REF (dra));
309 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
310 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
311 DR_REF (drb));
312 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
314 return true;
317 if (dump_enabled_p ())
319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
320 "versioning for alias required: "
321 "bad dist vector for ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
323 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
324 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
325 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
327 /* Add to list of ddrs that need to be tested at run-time. */
328 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
331 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
332 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
334 int dist = dist_v[loop_depth];
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location,
338 "dependence distance = %d.\n", dist);
340 if (dist == 0)
342 if (dump_enabled_p ())
344 dump_printf_loc (MSG_NOTE, vect_location,
345 "dependence distance == 0 between ");
346 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
347 dump_printf (MSG_NOTE, " and ");
348 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 349           dump_printf (MSG_NOTE, "\n");
352 /* When we perform grouped accesses and perform implicit CSE
353 by detecting equal accesses and doing disambiguation with
354 runtime alias tests like for
355 .. = a[i];
356 .. = a[i+1];
357 a[i] = ..;
358 a[i+1] = ..;
359 *p = ..;
360 .. = a[i];
361 .. = a[i+1];
362 where we will end up loading { a[i], a[i+1] } once, make
363 sure that inserting group loads before the first load and
364 stores after the last store will do the right thing. */
365 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
366 && GROUP_SAME_DR_STMT (stmtinfo_a))
367 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
368 && GROUP_SAME_DR_STMT (stmtinfo_b)))
370 gimple earlier_stmt;
371 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
372 if (DR_IS_WRITE
373 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
375 if (dump_enabled_p ())
376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
377 "READ_WRITE dependence in interleaving."
378 "\n");
379 return true;
383 continue;
386 if (dist > 0 && DDR_REVERSED_P (ddr))
388 /* If DDR_REVERSED_P the order of the data-refs in DDR was
389 reversed (to make distance vector positive), and the actual
390 distance is negative. */
391 if (dump_enabled_p ())
392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
393 "dependence distance negative.\n");
394 continue;
397 if (abs (dist) >= 2
398 && abs (dist) < *max_vf)
400 /* The dependence distance requires reduction of the maximal
401 vectorization factor. */
402 *max_vf = abs (dist);
403 if (dump_enabled_p ())
404 dump_printf_loc (MSG_NOTE, vect_location,
405 "adjusting maximal vectorization factor to %i\n",
406 *max_vf);
409 if (abs (dist) >= *max_vf)
411 /* Dependence distance does not create dependence, as far as
412 vectorization is concerned, in this case. */
413 if (dump_enabled_p ())
414 dump_printf_loc (MSG_NOTE, vect_location,
415 "dependence distance >= VF.\n");
416 continue;
419 if (dump_enabled_p ())
421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
422 "not vectorized, possible dependence "
423 "between data-refs ");
 424       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 425       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 426       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 427       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
430 return true;
433 return false;
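/* Editorial example (not from the GCC sources): for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the dependence distance between a[i] and a[i+3] is 3, so the code above
   lowers *max_vf to 3: with three elements per vector iteration no element
   written by the statement is read again in the same vector iteration,
   whereas a wider vectorization factor would read a[i+3] before the
   preceding write to it has happened.  */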
436 /* Function vect_analyze_data_ref_dependences.
438 Examine all the data references in the loop, and make sure there do not
439 exist any data dependences between them. Set *MAX_VF according to
440 the maximum vectorization factor the data dependences allow. */
442 bool
443 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
445 unsigned int i;
446 struct data_dependence_relation *ddr;
448 if (dump_enabled_p ())
449 dump_printf_loc (MSG_NOTE, vect_location,
450 "=== vect_analyze_data_ref_dependences ===\n");
452 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
453 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
454 &LOOP_VINFO_DDRS (loop_vinfo),
455 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
456 return false;
458 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
459 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
460 return false;
462 return true;
466 /* Function vect_slp_analyze_data_ref_dependence.
 468    Return TRUE if there (might) exist a dependence between a memory-reference
 469    DRA and a memory-reference DRB that prevents basic-block (SLP)
 470    vectorization.  Unlike the loop variant above, there is no *MAX_VF to
 471    adjust and no run-time alias versioning to fall back on.  */
473 static bool
474 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
476 struct data_reference *dra = DDR_A (ddr);
477 struct data_reference *drb = DDR_B (ddr);
479 /* We need to check dependences of statements marked as unvectorizable
480 as well, they still can prohibit vectorization. */
482 /* Independent data accesses. */
483 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
484 return false;
486 if (dra == drb)
487 return false;
489 /* Read-read is OK. */
490 if (DR_IS_READ (dra) && DR_IS_READ (drb))
491 return false;
493 /* If dra and drb are part of the same interleaving chain consider
494 them independent. */
495 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
496 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
497 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
498 return false;
500 /* Unknown data dependence. */
501 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
503 if (dump_enabled_p ())
505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
506 "can't determine dependence between ");
507 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
508 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
509 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
510 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
513 else if (dump_enabled_p ())
515 dump_printf_loc (MSG_NOTE, vect_location,
516 "determined dependence between ");
517 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
518 dump_printf (MSG_NOTE, " and ");
519 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
520 dump_printf (MSG_NOTE, "\n");
523 /* We do not vectorize basic blocks with write-write dependencies. */
524 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
525 return true;
527 /* If we have a read-write dependence check that the load is before the store.
528 When we vectorize basic blocks, vector load can be only before
529 corresponding scalar load, and vector store can be only after its
 530      corresponding scalar store.  So the order of the accesses is preserved in
531 case the load is before the store. */
532 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
533 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
535 /* That only holds for load-store pairs taking part in vectorization. */
536 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
537 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
538 return false;
541 return true;
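/* Editorial example (not from the GCC sources): in straight-line code

     x = a[0];                         a[0] = y;
     a[0] = y;          versus         x = a[0];

   the left-hand ordering is accepted (the scalar load comes first, and the
   vector load is emitted no later than it), while the right-hand ordering
   is a write followed by a read of the same location, which the check
   above rejects by returning true.  */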
 545 /* Function vect_slp_analyze_data_ref_dependences.
 547    Examine all the data references in the basic-block, and make sure there
 548    do not exist any data dependences between them that would prevent SLP
 549    vectorization.  */
551 bool
552 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
554 struct data_dependence_relation *ddr;
555 unsigned int i;
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location,
559 "=== vect_slp_analyze_data_ref_dependences ===\n");
561 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
562 &BB_VINFO_DDRS (bb_vinfo),
563 vNULL, true))
564 return false;
566 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
567 if (vect_slp_analyze_data_ref_dependence (ddr))
568 return false;
570 return true;
574 /* Function vect_compute_data_ref_alignment
576 Compute the misalignment of the data reference DR.
578 Output:
579 1. If during the misalignment computation it is found that the data reference
580 cannot be vectorized then false is returned.
581 2. DR_MISALIGNMENT (DR) is defined.
583 FOR NOW: No analysis is actually performed. Misalignment is calculated
584 only for trivial cases. TODO. */
586 static bool
587 vect_compute_data_ref_alignment (struct data_reference *dr)
589 gimple stmt = DR_STMT (dr);
590 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
591 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
592 struct loop *loop = NULL;
593 tree ref = DR_REF (dr);
594 tree vectype;
595 tree base, base_addr;
596 bool base_aligned;
597 tree misalign;
598 tree aligned_to, alignment;
600 if (dump_enabled_p ())
601 dump_printf_loc (MSG_NOTE, vect_location,
602 "vect_compute_data_ref_alignment:\n");
604 if (loop_vinfo)
605 loop = LOOP_VINFO_LOOP (loop_vinfo);
607 /* Initialize misalignment to unknown. */
608 SET_DR_MISALIGNMENT (dr, -1);
610 /* Strided loads perform only component accesses, misalignment information
611 is irrelevant for them. */
612 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
613 return true;
615 misalign = DR_INIT (dr);
616 aligned_to = DR_ALIGNED_TO (dr);
617 base_addr = DR_BASE_ADDRESS (dr);
618 vectype = STMT_VINFO_VECTYPE (stmt_info);
620 /* In case the dataref is in an inner-loop of the loop that is being
621 vectorized (LOOP), we use the base and misalignment information
622 relative to the outer-loop (LOOP). This is ok only if the misalignment
623 stays the same throughout the execution of the inner-loop, which is why
 624      we have to check that the stride of the dataref in the inner-loop is
 625      evenly divisible by the vector size.  */
626 if (loop && nested_in_vect_loop_p (loop, stmt))
628 tree step = DR_STEP (dr);
629 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
631 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
633 if (dump_enabled_p ())
634 dump_printf_loc (MSG_NOTE, vect_location,
635 "inner step divides the vector-size.\n");
636 misalign = STMT_VINFO_DR_INIT (stmt_info);
637 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
638 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
640 else
642 if (dump_enabled_p ())
643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
644 "inner step doesn't divide the vector-size.\n");
645 misalign = NULL_TREE;
649 /* Similarly, if we're doing basic-block vectorization, we can only use
650 base and misalignment information relative to an innermost loop if the
651 misalignment stays the same throughout the execution of the loop.
 652      As above, this is the case if the stride of the dataref is evenly
 653      divisible by the vector size.  */
654 if (!loop)
656 tree step = DR_STEP (dr);
657 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
659 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
661 if (dump_enabled_p ())
662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
663 "SLP: step doesn't divide the vector-size.\n");
664 misalign = NULL_TREE;
668 base = build_fold_indirect_ref (base_addr);
669 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
671 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
672 || !misalign)
674 if (dump_enabled_p ())
676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
677 "Unknown alignment for access: ");
678 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
679 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
681 return true;
684 if ((DECL_P (base)
685 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
686 alignment) >= 0)
687 || (TREE_CODE (base_addr) == SSA_NAME
688 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
689 TREE_TYPE (base_addr)))),
690 alignment) >= 0)
691 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
692 base_aligned = true;
693 else
694 base_aligned = false;
696 if (!base_aligned)
698 /* Do not change the alignment of global variables here if
699 flag_section_anchors is enabled as we already generated
700 RTL for other functions. Most global variables should
701 have been aligned during the IPA increase_alignment pass. */
702 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
703 || (TREE_STATIC (base) && flag_section_anchors))
705 if (dump_enabled_p ())
707 dump_printf_loc (MSG_NOTE, vect_location,
708 "can't force alignment of ref: ");
709 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
710 dump_printf (MSG_NOTE, "\n");
712 return true;
715 /* Force the alignment of the decl.
716 NOTE: This is the only change to the code we make during
717 the analysis phase, before deciding to vectorize the loop. */
718 if (dump_enabled_p ())
720 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
721 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
722 dump_printf (MSG_NOTE, "\n");
725 ((dataref_aux *)dr->aux)->base_decl = base;
726 ((dataref_aux *)dr->aux)->base_misaligned = true;
 729   /* If this is a backward-running DR then the first access in the larger
 730      vectype is actually N-1 elements before the address in the DR.
 731      Adjust misalign accordingly.  */
732 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
734 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
735 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
736 otherwise we wouldn't be here. */
737 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
738 /* PLUS because DR_STEP was negative. */
739 misalign = size_binop (PLUS_EXPR, misalign, offset);
742 /* Modulo alignment. */
743 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
745 if (!tree_fits_uhwi_p (misalign))
747 /* Negative or overflowed misalignment value. */
748 if (dump_enabled_p ())
749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
750 "unexpected misalign value\n");
751 return false;
754 SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
756 if (dump_enabled_p ())
758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
759 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
760 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
761 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
764 return true;
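/* Editorial example (not from the GCC sources), assuming a 16-byte V4SI
   vectype: for an access a[i + 1] on an int array whose base is known to
   be 16-byte aligned, DR_INIT is 4, so

     misalign = 4 % 16 = 4

   and DR_MISALIGNMENT (dr) is set to 4 bytes; if the base alignment cannot
   be determined the value stays at the -1 ("unknown") set at the top of
   this function.  */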
768 /* Function vect_compute_data_refs_alignment
 770    Compute the misalignment of the data references in the loop / basic block.
771 Return FALSE if a data reference is found that cannot be vectorized. */
773 static bool
774 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
775 bb_vec_info bb_vinfo)
777 vec<data_reference_p> datarefs;
778 struct data_reference *dr;
779 unsigned int i;
781 if (loop_vinfo)
782 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
783 else
784 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
786 FOR_EACH_VEC_ELT (datarefs, i, dr)
787 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
788 && !vect_compute_data_ref_alignment (dr))
790 if (bb_vinfo)
792 /* Mark unsupported statement as unvectorizable. */
793 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
794 continue;
796 else
797 return false;
800 return true;
804 /* Function vect_update_misalignment_for_peel
806 DR - the data reference whose misalignment is to be adjusted.
807 DR_PEEL - the data reference whose misalignment is being made
808 zero in the vector loop by the peel.
809 NPEEL - the number of iterations in the peel loop if the misalignment
810 of DR_PEEL is known at compile time. */
812 static void
813 vect_update_misalignment_for_peel (struct data_reference *dr,
814 struct data_reference *dr_peel, int npeel)
816 unsigned int i;
817 vec<dr_p> same_align_drs;
818 struct data_reference *current_dr;
819 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
820 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
821 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
822 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
824 /* For interleaved data accesses the step in the loop must be multiplied by
825 the size of the interleaving group. */
826 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
827 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
828 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
829 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
831 /* It can be assumed that the data refs with the same alignment as dr_peel
832 are aligned in the vector loop. */
833 same_align_drs
834 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
835 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
837 if (current_dr != dr)
838 continue;
839 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
840 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
841 SET_DR_MISALIGNMENT (dr, 0);
842 return;
845 if (known_alignment_for_access_p (dr)
846 && known_alignment_for_access_p (dr_peel))
848 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
849 int misal = DR_MISALIGNMENT (dr);
850 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
851 misal += negative ? -npeel * dr_size : npeel * dr_size;
852 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
853 SET_DR_MISALIGNMENT (dr, misal);
854 return;
857 if (dump_enabled_p ())
858 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
859 SET_DR_MISALIGNMENT (dr, -1);
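/* Editorial example (not from the GCC sources): with a 16-byte vectype,
   4-byte elements and NPEEL = 3, a data-ref whose known misalignment was
   8 bytes ends up with

     misal = (8 + 3 * 4) & 15 = 4

   after the update above (and (8 - 3 * 4) & 15 = 12 for a negative step),
   i.e. the peeled iterations simply advance the misalignment modulo the
   vector alignment.  */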
863 /* Function vect_verify_datarefs_alignment
865 Return TRUE if all data references in the loop can be
866 handled with respect to alignment. */
868 bool
869 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
871 vec<data_reference_p> datarefs;
872 struct data_reference *dr;
873 enum dr_alignment_support supportable_dr_alignment;
874 unsigned int i;
876 if (loop_vinfo)
877 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
878 else
879 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
881 FOR_EACH_VEC_ELT (datarefs, i, dr)
883 gimple stmt = DR_STMT (dr);
884 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
886 if (!STMT_VINFO_RELEVANT_P (stmt_info))
887 continue;
889 /* For interleaving, only the alignment of the first access matters.
890 Skip statements marked as not vectorizable. */
891 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
892 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
893 || !STMT_VINFO_VECTORIZABLE (stmt_info))
894 continue;
896 /* Strided loads perform only component accesses, alignment is
897 irrelevant for them. */
898 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
899 continue;
901 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
902 if (!supportable_dr_alignment)
904 if (dump_enabled_p ())
906 if (DR_IS_READ (dr))
907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
908 "not vectorized: unsupported unaligned load.");
909 else
910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
911 "not vectorized: unsupported unaligned "
912 "store.");
914 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
915 DR_REF (dr));
916 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
918 return false;
920 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
921 dump_printf_loc (MSG_NOTE, vect_location,
922 "Vectorizing an unaligned access.\n");
924 return true;
 927 /* Given a memory reference EXP return whether its alignment is less
928 than its size. */
930 static bool
931 not_size_aligned (tree exp)
933 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
934 return true;
936 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
937 > get_object_alignment (exp));
940 /* Function vector_alignment_reachable_p
942 Return true if vector alignment for DR is reachable by peeling
943 a few loop iterations. Return false otherwise. */
945 static bool
946 vector_alignment_reachable_p (struct data_reference *dr)
948 gimple stmt = DR_STMT (dr);
949 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
950 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
952 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 954       /* For interleaved access we peel only if the number of iterations in
 955          the prolog loop (VF - misalignment) is a multiple of the
 956          number of interleaved accesses.  */
957 int elem_size, mis_in_elements;
958 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
960 /* FORNOW: handle only known alignment. */
961 if (!known_alignment_for_access_p (dr))
962 return false;
964 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
965 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
967 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
968 return false;
 971   /* If the misalignment is known at compile time then allow peeling
 972      only if natural alignment is reachable through peeling.  */
973 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
975 HOST_WIDE_INT elmsize =
976 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
977 if (dump_enabled_p ())
979 dump_printf_loc (MSG_NOTE, vect_location,
980 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
981 dump_printf (MSG_NOTE,
982 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
984 if (DR_MISALIGNMENT (dr) % elmsize)
986 if (dump_enabled_p ())
987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
988 "data size does not divide the misalignment.\n");
989 return false;
993 if (!known_alignment_for_access_p (dr))
995 tree type = TREE_TYPE (DR_REF (dr));
996 bool is_packed = not_size_aligned (DR_REF (dr));
997 if (dump_enabled_p ())
998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
999 "Unknown misalignment, is_packed = %d\n",is_packed);
1000 if ((TYPE_USER_ALIGN (type) && !is_packed)
1001 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1002 return true;
1003 else
1004 return false;
1007 return true;
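/* Editorial example (not from the GCC sources): for V4SI (4 elements per
   vector) and a known misalignment of 1 element, peeling would need
   4 - 1 = 3 scalar iterations.  With an interleaving group of size 2 that
   count is not a multiple of the group size, so the grouped-access check
   above reports that alignment is not reachable by peeling.  */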
1011 /* Calculate the cost of the memory access represented by DR. */
1013 static void
1014 vect_get_data_access_cost (struct data_reference *dr,
1015 unsigned int *inside_cost,
1016 unsigned int *outside_cost,
1017 stmt_vector_for_cost *body_cost_vec)
1019 gimple stmt = DR_STMT (dr);
1020 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1021 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1022 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1023 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1024 int ncopies = vf / nunits;
1026 if (DR_IS_READ (dr))
1027 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1028 NULL, body_cost_vec, false);
1029 else
1030 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_NOTE, vect_location,
1034 "vect_get_data_access_cost: inside_cost = %d, "
1035 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1039 /* Insert DR into peeling hash table with NPEEL as key. */
1041 static void
1042 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1043 int npeel)
1045 struct _vect_peel_info elem, *slot;
1046 _vect_peel_info **new_slot;
1047 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1049 elem.npeel = npeel;
1050 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1051 if (slot)
1052 slot->count++;
1053 else
1055 slot = XNEW (struct _vect_peel_info);
1056 slot->npeel = npeel;
1057 slot->dr = dr;
1058 slot->count = 1;
1059 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1060 *new_slot = slot;
1063 if (!supportable_dr_alignment
1064 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1065 slot->count += VECT_MAX_COST;
1069 /* Traverse the peeling hash table to find the peeling option that aligns
1070    the maximum number of data accesses.  */
1073 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1074 _vect_peel_extended_info *max)
1076 vect_peel_info elem = *slot;
1078 if (elem->count > max->peel_info.count
1079 || (elem->count == max->peel_info.count
1080 && max->peel_info.npeel > elem->npeel))
1082 max->peel_info.npeel = elem->npeel;
1083 max->peel_info.count = elem->count;
1084 max->peel_info.dr = elem->dr;
1087 return 1;
1091 /* Traverse peeling hash table and calculate cost for each peeling option.
1092 Find the one with the lowest cost. */
1095 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1096 _vect_peel_extended_info *min)
1098 vect_peel_info elem = *slot;
1099 int save_misalignment, dummy;
1100 unsigned int inside_cost = 0, outside_cost = 0, i;
1101 gimple stmt = DR_STMT (elem->dr);
1102 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1103 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1104 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1105 struct data_reference *dr;
1106 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1107 int single_iter_cost;
1109 prologue_cost_vec.create (2);
1110 body_cost_vec.create (2);
1111 epilogue_cost_vec.create (2);
1113 FOR_EACH_VEC_ELT (datarefs, i, dr)
1115 stmt = DR_STMT (dr);
1116 stmt_info = vinfo_for_stmt (stmt);
1117 /* For interleaving, only the alignment of the first access
1118 matters. */
1119 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1120 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1121 continue;
1123 save_misalignment = DR_MISALIGNMENT (dr);
1124 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1125 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1126 &body_cost_vec);
1127 SET_DR_MISALIGNMENT (dr, save_misalignment);
1130 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1131 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1132 &dummy, single_iter_cost,
1133 &prologue_cost_vec,
1134 &epilogue_cost_vec);
1136 /* Prologue and epilogue costs are added to the target model later.
1137 These costs depend only on the scalar iteration cost, the
1138 number of peeling iterations finally chosen, and the number of
1139 misaligned statements. So discard the information found here. */
1140 prologue_cost_vec.release ();
1141 epilogue_cost_vec.release ();
1143 if (inside_cost < min->inside_cost
1144 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1146 min->inside_cost = inside_cost;
1147 min->outside_cost = outside_cost;
1148 min->body_cost_vec.release ();
1149 min->body_cost_vec = body_cost_vec;
1150 min->peel_info.dr = elem->dr;
1151 min->peel_info.npeel = elem->npeel;
1153 else
1154 body_cost_vec.release ();
1156 return 1;
1160 /* Choose best peeling option by traversing peeling hash table and either
1161 choosing an option with the lowest cost (if cost model is enabled) or the
1162 option that aligns as many accesses as possible. */
1164 static struct data_reference *
1165 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1166 unsigned int *npeel,
1167 stmt_vector_for_cost *body_cost_vec)
1169 struct _vect_peel_extended_info res;
1171 res.peel_info.dr = NULL;
1172 res.body_cost_vec = stmt_vector_for_cost ();
1174 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1176 res.inside_cost = INT_MAX;
1177 res.outside_cost = INT_MAX;
1178 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1179 .traverse <_vect_peel_extended_info *,
1180 vect_peeling_hash_get_lowest_cost> (&res);
1182 else
1184 res.peel_info.count = 0;
1185 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1186 .traverse <_vect_peel_extended_info *,
1187 vect_peeling_hash_get_most_frequent> (&res);
1190 *npeel = res.peel_info.npeel;
1191 *body_cost_vec = res.body_cost_vec;
1192 return res.peel_info.dr;
1196 /* Function vect_enhance_data_refs_alignment
1198 This pass will use loop versioning and loop peeling in order to enhance
1199 the alignment of data references in the loop.
1201 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1202 original loop is to be vectorized. Any other loops that are created by
1203    the transformations performed in this pass are not supposed to be
1204 vectorized. This restriction will be relaxed.
1206 This pass will require a cost model to guide it whether to apply peeling
1207 or versioning or a combination of the two. For example, the scheme that
1208    Intel uses when given a loop with several memory accesses is as follows:
1209    choose one memory access ('p') whose alignment you want to force by doing
1210 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1211 other accesses are not necessarily aligned, or (2) use loop versioning to
1212 generate one loop in which all accesses are aligned, and another loop in
1213 which only 'p' is necessarily aligned.
1215 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1216    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1217 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1219 Devising a cost model is the most critical aspect of this work. It will
1220 guide us on which access to peel for, whether to use loop versioning, how
1221 many versions to create, etc. The cost model will probably consist of
1222 generic considerations as well as target specific considerations (on
1223 powerpc for example, misaligned stores are more painful than misaligned
1224 loads).
1226 Here are the general steps involved in alignment enhancements:
1228 -- original loop, before alignment analysis:
1229 for (i=0; i<N; i++){
1230 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1231 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1234 -- After vect_compute_data_refs_alignment:
1235 for (i=0; i<N; i++){
1236 x = q[i]; # DR_MISALIGNMENT(q) = 3
1237 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1240 -- Possibility 1: we do loop versioning:
1241 if (p is aligned) {
1242 for (i=0; i<N; i++){ # loop 1A
1243 x = q[i]; # DR_MISALIGNMENT(q) = 3
1244 p[i] = y; # DR_MISALIGNMENT(p) = 0
1247 else {
1248 for (i=0; i<N; i++){ # loop 1B
1249 x = q[i]; # DR_MISALIGNMENT(q) = 3
1250 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1254 -- Possibility 2: we do loop peeling:
1255 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1256 x = q[i];
1257 p[i] = y;
1259 for (i = 3; i < N; i++){ # loop 2A
1260 x = q[i]; # DR_MISALIGNMENT(q) = 0
1261 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1264 -- Possibility 3: combination of loop peeling and versioning:
1265 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1266 x = q[i];
1267 p[i] = y;
1269 if (p is aligned) {
1270 for (i = 3; i<N; i++){ # loop 3A
1271 x = q[i]; # DR_MISALIGNMENT(q) = 0
1272 p[i] = y; # DR_MISALIGNMENT(p) = 0
1275 else {
1276 for (i = 3; i<N; i++){ # loop 3B
1277 x = q[i]; # DR_MISALIGNMENT(q) = 0
1278 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1282 These loops are later passed to loop_transform to be vectorized. The
1283 vectorizer will use the alignment information to guide the transformation
1284 (whether to generate regular loads/stores, or with special handling for
1285 misalignment). */
1287 bool
1288 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1290 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1291 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1292 enum dr_alignment_support supportable_dr_alignment;
1293 struct data_reference *dr0 = NULL, *first_store = NULL;
1294 struct data_reference *dr;
1295 unsigned int i, j;
1296 bool do_peeling = false;
1297 bool do_versioning = false;
1298 bool stat;
1299 gimple stmt;
1300 stmt_vec_info stmt_info;
1301 unsigned int npeel = 0;
1302 bool all_misalignments_unknown = true;
1303 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1304 unsigned possible_npeel_number = 1;
1305 tree vectype;
1306 unsigned int nelements, mis, same_align_drs_max = 0;
1307 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1309 if (dump_enabled_p ())
1310 dump_printf_loc (MSG_NOTE, vect_location,
1311 "=== vect_enhance_data_refs_alignment ===\n");
1313 /* While cost model enhancements are expected in the future, the high level
1314 view of the code at this time is as follows:
1316 A) If there is a misaligned access then see if peeling to align
1317 this access can make all data references satisfy
1318 vect_supportable_dr_alignment. If so, update data structures
1319 as needed and return true.
1321 B) If peeling wasn't possible and there is a data reference with an
1322 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1323 then see if loop versioning checks can be used to make all data
1324 references satisfy vect_supportable_dr_alignment. If so, update
1325 data structures as needed and return true.
1327 C) If neither peeling nor versioning were successful then return false if
1328 any data reference does not satisfy vect_supportable_dr_alignment.
1330 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1332 Note, Possibility 3 above (which is peeling and versioning together) is not
1333 being done at this time. */
1335 /* (1) Peeling to force alignment. */
1337 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1338 Considerations:
1339 + How many accesses will become aligned due to the peeling
1340 - How many accesses will become unaligned due to the peeling,
1341 and the cost of misaligned accesses.
1342 - The cost of peeling (the extra runtime checks, the increase
1343 in code size). */
1345 FOR_EACH_VEC_ELT (datarefs, i, dr)
1347 stmt = DR_STMT (dr);
1348 stmt_info = vinfo_for_stmt (stmt);
1350 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1351 continue;
1353 /* For interleaving, only the alignment of the first access
1354 matters. */
1355 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1356 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1357 continue;
1359 /* For invariant accesses there is nothing to enhance. */
1360 if (integer_zerop (DR_STEP (dr)))
1361 continue;
1363 /* Strided loads perform only component accesses, alignment is
1364 irrelevant for them. */
1365 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1366 continue;
1368 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1369 do_peeling = vector_alignment_reachable_p (dr);
1370 if (do_peeling)
1372 if (known_alignment_for_access_p (dr))
1374 unsigned int npeel_tmp;
1375 bool negative = tree_int_cst_compare (DR_STEP (dr),
1376 size_zero_node) < 0;
1378 /* Save info about DR in the hash table. */
1379 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1380 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1382 vectype = STMT_VINFO_VECTYPE (stmt_info);
1383 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1384 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1385 TREE_TYPE (DR_REF (dr))));
1386 npeel_tmp = (negative
1387 ? (mis - nelements) : (nelements - mis))
1388 & (nelements - 1);
1390 /* For multiple types, it is possible that the bigger type access
1391 will have more than one peeling option. E.g., a loop with two
1392 types: one of size (vector size / 4), and the other one of
1393              size (vector size / 8).  The vectorization factor will be 8.  If both
1394              accesses are misaligned by 3, the first one needs one scalar
1395              iteration to be aligned, and the second one needs 5.  But
1396              the first one will also be aligned by peeling 5 scalar
1397              iterations, and in that case both accesses will be aligned.
1398              Hence, besides the immediate peeling amount, we also want
1399              to try adding a full vector size, as long as we don't exceed the
1400              vectorization factor.
1401              We do this automatically for the cost model, since we calculate the cost
1402 for every peeling option. */
1403 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1404 possible_npeel_number = vf /nelements;
1406 /* Handle the aligned case. We may decide to align some other
1407 access, making DR unaligned. */
1408 if (DR_MISALIGNMENT (dr) == 0)
1410 npeel_tmp = 0;
1411 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1412 possible_npeel_number++;
1415 for (j = 0; j < possible_npeel_number; j++)
1417 gcc_assert (npeel_tmp <= vf);
1418 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1419 npeel_tmp += nelements;
1422 all_misalignments_unknown = false;
1423 /* Data-ref that was chosen for the case that all the
1424 misalignments are unknown is not relevant anymore, since we
1425 have a data-ref with known alignment. */
1426 dr0 = NULL;
1428 else
1430 /* If we don't know any misalignment values, we prefer
1431              peeling for the data-ref that has the maximum number of data-refs
1432              with the same alignment, unless the target prefers to align
1433              stores over loads.  */
1434 if (all_misalignments_unknown)
1436 unsigned same_align_drs
1437 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1438 if (!dr0
1439 || same_align_drs_max < same_align_drs)
1441 same_align_drs_max = same_align_drs;
1442 dr0 = dr;
1444 /* For data-refs with the same number of related
1445 accesses prefer the one where the misalign
1446 computation will be invariant in the outermost loop. */
1447 else if (same_align_drs_max == same_align_drs)
1449 struct loop *ivloop0, *ivloop;
1450 ivloop0 = outermost_invariant_loop_for_expr
1451 (loop, DR_BASE_ADDRESS (dr0));
1452 ivloop = outermost_invariant_loop_for_expr
1453 (loop, DR_BASE_ADDRESS (dr));
1454 if ((ivloop && !ivloop0)
1455 || (ivloop && ivloop0
1456 && flow_loop_nested_p (ivloop, ivloop0)))
1457 dr0 = dr;
1460 if (!first_store && DR_IS_WRITE (dr))
1461 first_store = dr;
1464 /* If there are both known and unknown misaligned accesses in the
1465 loop, we choose peeling amount according to the known
1466 accesses. */
1467 if (!supportable_dr_alignment)
1469 dr0 = dr;
1470 if (!first_store && DR_IS_WRITE (dr))
1471 first_store = dr;
1475 else
1477 if (!aligned_access_p (dr))
1479 if (dump_enabled_p ())
1480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1481 "vector alignment may not be reachable\n");
1482 break;
1487 /* Check if we can possibly peel the loop. */
1488 if (!vect_can_advance_ivs_p (loop_vinfo)
1489 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1490 do_peeling = false;
1492 if (do_peeling && all_misalignments_unknown
1493 && vect_supportable_dr_alignment (dr0, false))
1496       /* Check if the target prefers to align stores over loads, i.e., if
1497          misaligned stores are more expensive than misaligned loads (taking
1498          drs with the same alignment into account).  */
1499 if (first_store && DR_IS_READ (dr0))
1501 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1502 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1503 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1504 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1505 stmt_vector_for_cost dummy;
1506 dummy.create (2);
1508 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1509 &dummy);
1510 vect_get_data_access_cost (first_store, &store_inside_cost,
1511 &store_outside_cost, &dummy);
1513 dummy.release ();
1515 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1516 aligning the load DR0). */
1517 load_inside_penalty = store_inside_cost;
1518 load_outside_penalty = store_outside_cost;
1519 for (i = 0;
1520 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1521 DR_STMT (first_store))).iterate (i, &dr);
1522 i++)
1523 if (DR_IS_READ (dr))
1525 load_inside_penalty += load_inside_cost;
1526 load_outside_penalty += load_outside_cost;
1528 else
1530 load_inside_penalty += store_inside_cost;
1531 load_outside_penalty += store_outside_cost;
1534 /* Calculate the penalty for leaving DR0 unaligned (by
1535 aligning the FIRST_STORE). */
1536 store_inside_penalty = load_inside_cost;
1537 store_outside_penalty = load_outside_cost;
1538 for (i = 0;
1539 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1540 DR_STMT (dr0))).iterate (i, &dr);
1541 i++)
1542 if (DR_IS_READ (dr))
1544 store_inside_penalty += load_inside_cost;
1545 store_outside_penalty += load_outside_cost;
1547 else
1549 store_inside_penalty += store_inside_cost;
1550 store_outside_penalty += store_outside_cost;
1553 if (load_inside_penalty > store_inside_penalty
1554 || (load_inside_penalty == store_inside_penalty
1555 && load_outside_penalty > store_outside_penalty))
1556 dr0 = first_store;
1559 /* In case there are only loads with different unknown misalignments, use
1560 peeling only if it may help to align other accesses in the loop. */
1561 if (!first_store
1562 && !STMT_VINFO_SAME_ALIGN_REFS (
1563 vinfo_for_stmt (DR_STMT (dr0))).length ()
1564 && vect_supportable_dr_alignment (dr0, false)
1565 != dr_unaligned_supported)
1566 do_peeling = false;
1569 if (do_peeling && !dr0)
1571 /* Peeling is possible, but there is no data access that is not supported
1572 unless aligned. So we try to choose the best possible peeling. */
1574 /* We should get here only if there are drs with known misalignment. */
1575 gcc_assert (!all_misalignments_unknown);
1577 /* Choose the best peeling from the hash table. */
1578 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1579 &body_cost_vec);
1580 if (!dr0 || !npeel)
1581 do_peeling = false;
1584 if (do_peeling)
1586 stmt = DR_STMT (dr0);
1587 stmt_info = vinfo_for_stmt (stmt);
1588 vectype = STMT_VINFO_VECTYPE (stmt_info);
1589 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1591 if (known_alignment_for_access_p (dr0))
1593 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1594 size_zero_node) < 0;
1595 if (!npeel)
1597 /* Since it's known at compile time, compute the number of
1598 iterations in the peeled loop (the peeling factor) for use in
1599 updating DR_MISALIGNMENT values. The peeling factor is the
1600 vectorization factor minus the misalignment as an element
1601 count. */
1602 mis = DR_MISALIGNMENT (dr0);
1603 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1604 npeel = ((negative ? mis - nelements : nelements - mis)
1605 & (nelements - 1));
1608 /* For interleaved data access every iteration accesses all the
1609 members of the group, therefore we divide the number of iterations
1610 by the group size. */
1611 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1612 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1613 npeel /= GROUP_SIZE (stmt_info);
1615 if (dump_enabled_p ())
1616 dump_printf_loc (MSG_NOTE, vect_location,
1617 "Try peeling by %d\n", npeel);
1620 /* Ensure that all data refs can be vectorized after the peel. */
1621 FOR_EACH_VEC_ELT (datarefs, i, dr)
1623 int save_misalignment;
1625 if (dr == dr0)
1626 continue;
1628 stmt = DR_STMT (dr);
1629 stmt_info = vinfo_for_stmt (stmt);
1630 /* For interleaving, only the alignment of the first access
1631 matters. */
1632 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1633 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1634 continue;
1636 /* Strided loads perform only component accesses, alignment is
1637 irrelevant for them. */
1638 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1639 continue;
1641 save_misalignment = DR_MISALIGNMENT (dr);
1642 vect_update_misalignment_for_peel (dr, dr0, npeel);
1643 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1644 SET_DR_MISALIGNMENT (dr, save_misalignment);
1646 if (!supportable_dr_alignment)
1648 do_peeling = false;
1649 break;
1653 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1655 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1656 if (!stat)
1657 do_peeling = false;
1658 else
1660 body_cost_vec.release ();
1661 return stat;
1665 if (do_peeling)
1667 unsigned max_allowed_peel
1668 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1669 if (max_allowed_peel != (unsigned)-1)
1671 unsigned max_peel = npeel;
1672 if (max_peel == 0)
1674 gimple dr_stmt = DR_STMT (dr0);
1675 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1676 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1677 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1679 if (max_peel > max_allowed_peel)
1681 do_peeling = false;
1682 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_NOTE, vect_location,
1684 "Disable peeling, max peels reached: %d\n", max_peel);
1689 if (do_peeling)
1691 stmt_info_for_cost *si;
1692 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1694 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1695 If the misalignment of DR_i is identical to that of dr0 then set
1696 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1697 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1698 by the peeling factor times the element size of DR_i (MOD the
1699 vectorization factor times the size). Otherwise, the
1700 misalignment of DR_i must be set to unknown. */
1701 FOR_EACH_VEC_ELT (datarefs, i, dr)
1702 if (dr != dr0)
1703 vect_update_misalignment_for_peel (dr, dr0, npeel);
1705 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1706 if (npeel)
1707 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1708 else
1709 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1710 = DR_MISALIGNMENT (dr0);
1711 SET_DR_MISALIGNMENT (dr0, 0);
1712 if (dump_enabled_p ())
1714 dump_printf_loc (MSG_NOTE, vect_location,
1715 "Alignment of access forced using peeling.\n");
1716 dump_printf_loc (MSG_NOTE, vect_location,
1717 "Peeling for alignment will be applied.\n");
1719 /* We've delayed passing the inside-loop peeling costs to the
1720 target cost model until we were sure peeling would happen.
1721 Do so now. */
1722 if (body_cost_vec.exists ())
1724 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1726 struct _stmt_vec_info *stmt_info
1727 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1728 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1729 si->misalign, vect_body);
1731 body_cost_vec.release ();
1734 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1735 gcc_assert (stat);
1736 return stat;
1740 body_cost_vec.release ();
1742 /* (2) Versioning to force alignment. */
1744 /* Try versioning if:
1745 1) optimize loop for speed
1746 2) there is at least one unsupported misaligned data ref with an unknown
1747 misalignment, and
1748 3) all misaligned data refs with a known misalignment are supported, and
1749 4) the number of runtime alignment checks is within reason. */
1751 do_versioning =
1752 optimize_loop_nest_for_speed_p (loop)
1753 && (!loop->inner); /* FORNOW */
1755 if (do_versioning)
1757 FOR_EACH_VEC_ELT (datarefs, i, dr)
1759 stmt = DR_STMT (dr);
1760 stmt_info = vinfo_for_stmt (stmt);
1762 /* For interleaving, only the alignment of the first access
1763 matters. */
1764 if (aligned_access_p (dr)
1765 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1766 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1767 continue;
1769 /* Strided loads perform only component accesses, alignment is
1770 irrelevant for them. */
1771 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1772 continue;
1774 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1776 if (!supportable_dr_alignment)
1778 gimple stmt;
1779 int mask;
1780 tree vectype;
1782 if (known_alignment_for_access_p (dr)
1783 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1784 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1786 do_versioning = false;
1787 break;
1790 stmt = DR_STMT (dr);
1791 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1792 gcc_assert (vectype);
1794 /* The rightmost bits of an aligned address must be zeros.
1795 Construct the mask needed for this test. For example,
1796 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1797 mask must be 15 = 0xf. */
1798 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1800 /* FORNOW: use the same mask to test all potentially unaligned
1801 references in the loop. The vectorizer currently supports
1802 a single vector size, see the reference to
1803 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1804 vectorization factor is computed. */
1805 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1806 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1807 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1808 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1809 DR_STMT (dr));
1813 /* Versioning requires at least one misaligned data reference. */
1814 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1815 do_versioning = false;
1816 else if (!do_versioning)
1817 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1820 if (do_versioning)
1822 vec<gimple> may_misalign_stmts
1823 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1824 gimple stmt;
1826 /* It can now be assumed that the data references in the statements
1827 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1828 of the loop being vectorized. */
1829 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1831 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1832 dr = STMT_VINFO_DATA_REF (stmt_info);
1833 SET_DR_MISALIGNMENT (dr, 0);
1834 if (dump_enabled_p ())
1835 dump_printf_loc (MSG_NOTE, vect_location,
1836 "Alignment of access forced using versioning.\n");
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE, vect_location,
1841 "Versioning for alignment will be applied.\n");
1843 /* Peeling and versioning can't be done together at this time. */
1844 gcc_assert (! (do_peeling && do_versioning));
1846 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1847 gcc_assert (stat);
1848 return stat;
1851 /* This point is reached if neither peeling nor versioning is being done. */
1852 gcc_assert (! (do_peeling || do_versioning));
1854 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1855 return stat;
1859 /* Function vect_find_same_alignment_drs.
1861 Update group and alignment relations according to the chosen
1862 vectorization factor. */
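/* For example (illustrative): with a vectorization factor of 4 and
   4-byte accesses a[i] and a[i+4], the dependence distance is 4, a
   multiple of the vectorization factor, so the two references are
   recorded as having the same alignment. */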
1864 static void
1865 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1866 loop_vec_info loop_vinfo)
1868 unsigned int i;
1869 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1870 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1871 struct data_reference *dra = DDR_A (ddr);
1872 struct data_reference *drb = DDR_B (ddr);
1873 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1874 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1875 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1876 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1877 lambda_vector dist_v;
1878 unsigned int loop_depth;
1880 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1881 return;
1883 if (dra == drb)
1884 return;
1886 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1887 return;
1889 /* Loop-based vectorization and known data dependence. */
1890 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1891 return;
1893 /* Data-dependence analysis reports a distance vector of zero
1894 for data-references that overlap only in the first iteration
1895 but have steps of different sign (see PR45764).
1896 So as a sanity check require equal DR_STEP. */
1897 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1898 return;
1900 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1901 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1903 int dist = dist_v[loop_depth];
1905 if (dump_enabled_p ())
1906 dump_printf_loc (MSG_NOTE, vect_location,
1907 "dependence distance = %d.\n", dist);
1909 /* Same loop iteration. */
1910 if (dist == 0
1911 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1913 /* Two references with distance zero have the same alignment. */
1914 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1915 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1916 if (dump_enabled_p ())
1918 dump_printf_loc (MSG_NOTE, vect_location,
1919 "accesses have the same alignment.\n");
1920 dump_printf (MSG_NOTE,
1921 "dependence distance modulo vf == 0 between ");
1922 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1923 dump_printf (MSG_NOTE, " and ");
1924 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1925 dump_printf (MSG_NOTE, "\n");
1932 /* Function vect_analyze_data_refs_alignment
1934 Analyze the alignment of the data-references in the loop.
1935 Return FALSE if a data reference is found that cannot be vectorized. */
1937 bool
1938 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1939 bb_vec_info bb_vinfo)
1941 if (dump_enabled_p ())
1942 dump_printf_loc (MSG_NOTE, vect_location,
1943 "=== vect_analyze_data_refs_alignment ===\n");
1945 /* Mark groups of data references with same alignment using
1946 data dependence information. */
1947 if (loop_vinfo)
1949 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1950 struct data_dependence_relation *ddr;
1951 unsigned int i;
1953 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1954 vect_find_same_alignment_drs (ddr, loop_vinfo);
1957 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1959 if (dump_enabled_p ())
1960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1961 "not vectorized: can't calculate alignment "
1962 "for data ref.\n");
1963 return false;
1966 return true;
1970 /* Analyze groups of accesses: check that DR belongs to a group of
1971 accesses of legal size, step, etc. Detect gaps, single element
1972 interleaving, and other special cases. Set grouped access info.
1973 Collect groups of strided stores for further use in SLP analysis. */
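/* Illustrative examples of the access patterns handled below:

     for (i = 0; i < N; i++)
       { a[2*i] = x; a[2*i+1] = y; }

   is an interleaved store group of size 2 with step 2 * sizeof (a[0])
   and no gaps, whereas

     for (i = 0; i < N; i++)
       s += a[4*i];

   is single element interleaving of a load with groupsize 4; the gaps
   require a scalar epilogue loop. */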
1975 static bool
1976 vect_analyze_group_access (struct data_reference *dr)
1978 tree step = DR_STEP (dr);
1979 tree scalar_type = TREE_TYPE (DR_REF (dr));
1980 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
1981 gimple stmt = DR_STMT (dr);
1982 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1983 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1984 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
1985 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
1986 HOST_WIDE_INT groupsize, last_accessed_element = 1;
1987 bool slp_impossible = false;
1988 struct loop *loop = NULL;
1990 if (loop_vinfo)
1991 loop = LOOP_VINFO_LOOP (loop_vinfo);
1993 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
1994 size of the interleaving group (including gaps). */
1995 groupsize = absu_hwi (dr_step) / type_size;
1997 /* A non-consecutive access is possible only if it is part of an interleaving group. */
1998 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2000 /* Check whether this DR is part of an interleaving group, and is a single
2001 element of the group that is accessed in the loop. */
2003 /* Gaps are supported only for loads. STEP must be a multiple of the type
2004 size. The size of the group must be a power of 2. */
2005 if (DR_IS_READ (dr)
2006 && (dr_step % type_size) == 0
2007 && groupsize > 0
2008 && exact_log2 (groupsize) != -1)
2010 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2011 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2012 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_NOTE, vect_location,
2015 "Detected single element interleaving ");
2016 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2017 dump_printf (MSG_NOTE, " step ");
2018 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2019 dump_printf (MSG_NOTE, "\n");
2022 if (loop_vinfo)
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_NOTE, vect_location,
2026 "Data access with gaps requires scalar "
2027 "epilogue loop\n");
2028 if (loop->inner)
2030 if (dump_enabled_p ())
2031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2032 "Peeling for outer loop is not"
2033 " supported\n");
2034 return false;
2037 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2040 return true;
2043 if (dump_enabled_p ())
2045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2046 "not consecutive access ");
2047 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2048 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2051 if (bb_vinfo)
2053 /* Mark the statement as unvectorizable. */
2054 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2055 return true;
2058 return false;
2061 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2063 /* First stmt in the interleaving chain. Check the chain. */
2064 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2065 struct data_reference *data_ref = dr;
2066 unsigned int count = 1;
2067 tree prev_init = DR_INIT (data_ref);
2068 gimple prev = stmt;
2069 HOST_WIDE_INT diff, gaps = 0;
2070 unsigned HOST_WIDE_INT count_in_bytes;
2072 while (next)
2074 /* Skip identical data-refs. If two or more stmts share a
2075 data-ref (supported only for loads), we vectorize only the first
2076 stmt, and the rest get their vectorized loads from the first
2077 one. */
2078 if (!tree_int_cst_compare (DR_INIT (data_ref),
2079 DR_INIT (STMT_VINFO_DATA_REF (
2080 vinfo_for_stmt (next)))))
2082 if (DR_IS_WRITE (data_ref))
2084 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2086 "Two store stmts share the same dr.\n");
2087 return false;
2090 /* For load use the same data-ref load. */
2091 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2093 prev = next;
2094 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2095 continue;
2098 prev = next;
2099 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2101 /* All group members have the same STEP by construction. */
2102 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2104 /* Check that the distance between two accesses is equal to the type
2105 size. Otherwise, we have gaps. */
2106 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2107 - TREE_INT_CST_LOW (prev_init)) / type_size;
2108 if (diff != 1)
2110 /* FORNOW: SLP of accesses with gaps is not supported. */
2111 slp_impossible = true;
2112 if (DR_IS_WRITE (data_ref))
2114 if (dump_enabled_p ())
2115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2116 "interleaved store with gaps\n");
2117 return false;
2120 gaps += diff - 1;
2123 last_accessed_element += diff;
2125 /* Store the gap from the previous member of the group. If there is no
2126 gap in the access, GROUP_GAP is always 1. */
2127 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2129 prev_init = DR_INIT (data_ref);
2130 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2131 /* Count the number of data-refs in the chain. */
2132 count++;
2135 /* COUNT is the number of accesses found; we multiply it by the size of
2136 the type to get COUNT_IN_BYTES. */
2137 count_in_bytes = type_size * count;
2139 /* Check that the size of the interleaving (including gaps) is not
2140 greater than STEP. */
2141 if (dr_step != 0
2142 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2144 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147 "interleaving size is greater than step for ");
2148 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2149 DR_REF (dr));
2150 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2152 return false;
2155 /* Check that the size of the interleaving is equal to STEP for stores,
2156 i.e., that there are no gaps. */
2157 if (dr_step != 0
2158 && absu_hwi (dr_step) != count_in_bytes)
2160 if (DR_IS_READ (dr))
2162 slp_impossible = true;
2163 /* There is a gap after the last load in the group. This gap is a
2164 difference between the groupsize and the number of elements.
2165 When there is no gap, this difference should be 0. */
2166 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2168 else
2170 if (dump_enabled_p ())
2171 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2172 "interleaved store with gaps\n");
2173 return false;
2177 /* Check that STEP is a multiple of type size. */
2178 if (dr_step != 0
2179 && (dr_step % type_size) != 0)
2181 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184 "step is not a multiple of type size: step ");
2185 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2186 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2187 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2188 TYPE_SIZE_UNIT (scalar_type));
2189 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2191 return false;
2194 if (groupsize == 0)
2195 groupsize = count;
2197 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2198 if (dump_enabled_p ())
2199 dump_printf_loc (MSG_NOTE, vect_location,
2200 "Detected interleaving of size %d\n", (int)groupsize);
2202 /* SLP: create an SLP data structure for every interleaving group of
2203 stores for further analysis in vect_analyse_slp. */
2204 if (DR_IS_WRITE (dr) && !slp_impossible)
2206 if (loop_vinfo)
2207 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2208 if (bb_vinfo)
2209 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2212 /* There is a gap in the end of the group. */
2213 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2215 if (dump_enabled_p ())
2216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2217 "Data access with gaps requires scalar "
2218 "epilogue loop\n");
2219 if (loop->inner)
2221 if (dump_enabled_p ())
2222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2223 "Peeling for outer loop is not supported\n");
2224 return false;
2227 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2231 return true;
2235 /* Analyze the access pattern of the data-reference DR.
2236 In case of non-consecutive accesses call vect_analyze_group_access() to
2237 analyze groups of accesses. */
2239 static bool
2240 vect_analyze_data_ref_access (struct data_reference *dr)
2242 tree step = DR_STEP (dr);
2243 tree scalar_type = TREE_TYPE (DR_REF (dr));
2244 gimple stmt = DR_STMT (dr);
2245 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2246 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2247 struct loop *loop = NULL;
2249 if (loop_vinfo)
2250 loop = LOOP_VINFO_LOOP (loop_vinfo);
2252 if (loop_vinfo && !step)
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256 "bad data-ref access in loop\n");
2257 return false;
2260 /* Allow invariant loads in loops that are not nested. */
2261 if (loop_vinfo && integer_zerop (step))
2263 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2264 if (nested_in_vect_loop_p (loop, stmt))
2266 if (dump_enabled_p ())
2267 dump_printf_loc (MSG_NOTE, vect_location,
2268 "zero step in inner loop of nest\n");
2269 return false;
2271 return DR_IS_READ (dr);
2274 if (loop && nested_in_vect_loop_p (loop, stmt))
2276 /* Interleaved accesses are not yet supported within outer-loop
2277 vectorization for references in the inner-loop. */
2278 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2280 /* For the rest of the analysis we use the outer-loop step. */
2281 step = STMT_VINFO_DR_STEP (stmt_info);
2282 if (integer_zerop (step))
2284 if (dump_enabled_p ())
2285 dump_printf_loc (MSG_NOTE, vect_location,
2286 "zero step in outer loop.\n");
2287 if (DR_IS_READ (dr))
2288 return true;
2289 else
2290 return false;
2294 /* Consecutive? */
2295 if (TREE_CODE (step) == INTEGER_CST)
2297 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2298 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2299 || (dr_step < 0
2300 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2302 /* Mark that it is not interleaving. */
2303 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2304 return true;
2308 if (loop && nested_in_vect_loop_p (loop, stmt))
2310 if (dump_enabled_p ())
2311 dump_printf_loc (MSG_NOTE, vect_location,
2312 "grouped access in outer loop.\n");
2313 return false;
2316 /* Assume this is a DR handled by the non-constant strided load case. */
2317 if (TREE_CODE (step) != INTEGER_CST)
2318 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2320 /* Non-consecutive access - check whether it is part of an interleaving group. */
2321 return vect_analyze_group_access (dr);
2326 /* A helper function used in the comparator function to sort data
2327 references. T1 and T2 are two data references to be compared.
2328 The function returns -1, 0, or 1. */
2330 static int
2331 compare_tree (tree t1, tree t2)
2333 int i, cmp;
2334 enum tree_code code;
2335 char tclass;
2337 if (t1 == t2)
2338 return 0;
2339 if (t1 == NULL)
2340 return -1;
2341 if (t2 == NULL)
2342 return 1;
2345 if (TREE_CODE (t1) != TREE_CODE (t2))
2346 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2348 code = TREE_CODE (t1);
2349 switch (code)
2351 /* For const values, we can just use hash values for comparisons. */
2352 case INTEGER_CST:
2353 case REAL_CST:
2354 case FIXED_CST:
2355 case STRING_CST:
2356 case COMPLEX_CST:
2357 case VECTOR_CST:
2359 hashval_t h1 = iterative_hash_expr (t1, 0);
2360 hashval_t h2 = iterative_hash_expr (t2, 0);
2361 if (h1 != h2)
2362 return h1 < h2 ? -1 : 1;
2363 break;
2366 case SSA_NAME:
2367 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2368 if (cmp != 0)
2369 return cmp;
2371 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2372 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2373 break;
2375 default:
2376 tclass = TREE_CODE_CLASS (code);
2378 /* For declarations, we can compare their UIDs. */
2379 if (tclass == tcc_declaration)
2381 if (DECL_UID (t1) != DECL_UID (t2))
2382 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2383 break;
2386 /* For expressions with operands, compare their operands recursively. */
2387 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2389 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2390 if (cmp != 0)
2391 return cmp;
2395 return 0;
2399 /* Comparison function to sort two data-references DRA and DRB into
2400 chunks suitable for grouping. */
2402 static int
2403 dr_group_sort_cmp (const void *dra_, const void *drb_)
2405 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2406 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2407 int cmp;
2409 /* Stabilize sort. */
2410 if (dra == drb)
2411 return 0;
2413 /* Ordering of DRs according to base. */
2414 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2416 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2417 if (cmp != 0)
2418 return cmp;
2421 /* And according to DR_OFFSET. */
2422 if (!dr_equal_offsets_p (dra, drb))
2424 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2425 if (cmp != 0)
2426 return cmp;
2429 /* Put reads before writes. */
2430 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2431 return DR_IS_READ (dra) ? -1 : 1;
2433 /* Then sort by access size. */
2434 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2435 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2437 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2438 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2439 if (cmp != 0)
2440 return cmp;
2443 /* And by step. */
2444 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2446 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2447 if (cmp != 0)
2448 return cmp;
2451 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2452 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2453 if (cmp == 0)
2454 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2455 return cmp;
2458 /* Function vect_analyze_data_ref_accesses.
2460 Analyze the access pattern of all the data references in the loop.
2462 FORNOW: the only access pattern that is considered vectorizable is a
2463 simple step 1 (consecutive) access.
2465 FORNOW: handle only arrays and pointer accesses. */
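/* For example (illustrative): the loads a[3*i], a[3*i+1] and a[3*i+2]
   share the same base, offset and step and have inits 0, 4 and 8 for
   4-byte elements, so after sorting they are linked below into a single
   interleaving chain headed by the access with the smallest init. */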
2467 bool
2468 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2470 unsigned int i;
2471 vec<data_reference_p> datarefs;
2472 struct data_reference *dr;
2474 if (dump_enabled_p ())
2475 dump_printf_loc (MSG_NOTE, vect_location,
2476 "=== vect_analyze_data_ref_accesses ===\n");
2478 if (loop_vinfo)
2479 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2480 else
2481 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2483 if (datarefs.is_empty ())
2484 return true;
2486 /* Sort the array of datarefs to make building the interleaving chains
2487 linear. */
2488 qsort (datarefs.address (), datarefs.length (),
2489 sizeof (data_reference_p), dr_group_sort_cmp);
2491 /* Build the interleaving chains. */
2492 for (i = 0; i < datarefs.length () - 1;)
2494 data_reference_p dra = datarefs[i];
2495 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2496 stmt_vec_info lastinfo = NULL;
2497 for (i = i + 1; i < datarefs.length (); ++i)
2499 data_reference_p drb = datarefs[i];
2500 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2502 /* ??? Imperfect sorting (non-compatible types, non-modulo
2503 accesses, same accesses) can lead to a group being artificially
2504 split here as we don't just skip over those. If it really
2505 matters we can push those to a worklist and re-iterate
2506 over them. Then we can just skip ahead to the next DR here. */
2508 /* Check that the data-refs have the same first location (except init)
2509 and that they are both either stores or loads (not a load and a store). */
2510 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2511 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2512 DR_BASE_ADDRESS (drb), 0)
2513 || !dr_equal_offsets_p (dra, drb))
2514 break;
2516 /* Check that the data-refs have the same constant size and step. */
2517 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2518 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2519 if (!tree_fits_uhwi_p (sza)
2520 || !tree_fits_uhwi_p (szb)
2521 || !tree_int_cst_equal (sza, szb)
2522 || !tree_fits_shwi_p (DR_STEP (dra))
2523 || !tree_fits_shwi_p (DR_STEP (drb))
2524 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2525 break;
2527 /* Do not place the same access in the interleaving chain twice. */
2528 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2529 break;
2531 /* Check the types are compatible.
2532 ??? We don't distinguish this during sorting. */
2533 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2534 TREE_TYPE (DR_REF (drb))))
2535 break;
2537 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2538 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2539 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2540 gcc_assert (init_a < init_b);
2542 /* If init_b == init_a + the size of the type * k, we have an
2543 interleaving, and DRA is accessed before DRB. */
2544 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2545 if ((init_b - init_a) % type_size_a != 0)
2546 break;
2548 /* The step (if not zero) is greater than the difference between
2549 data-refs' inits. This splits groups into suitable sizes. */
2550 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2551 if (step != 0 && step <= (init_b - init_a))
2552 break;
2554 if (dump_enabled_p ())
2556 dump_printf_loc (MSG_NOTE, vect_location,
2557 "Detected interleaving ");
2558 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2559 dump_printf (MSG_NOTE, " and ");
2560 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2561 dump_printf (MSG_NOTE, "\n");
2564 /* Link the found element into the group list. */
2565 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2567 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2568 lastinfo = stmtinfo_a;
2570 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2571 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2572 lastinfo = stmtinfo_b;
2576 FOR_EACH_VEC_ELT (datarefs, i, dr)
2577 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2578 && !vect_analyze_data_ref_access (dr))
2580 if (dump_enabled_p ())
2581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2582 "not vectorized: complicated access pattern.\n");
2584 if (bb_vinfo)
2586 /* Mark the statement as not vectorizable. */
2587 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2588 continue;
2590 else
2591 return false;
2594 return true;
2598 /* Operator == between two dr_with_seg_len objects.
2600 This equality operator is used to make sure two data refs
2601 are the same, so that we will consider combining the
2602 aliasing checks of those two pairs of data-dependent data
2603 refs. */
2605 static bool
2606 operator == (const dr_with_seg_len& d1,
2607 const dr_with_seg_len& d2)
2609 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2610 DR_BASE_ADDRESS (d2.dr), 0)
2611 && compare_tree (d1.offset, d2.offset) == 0
2612 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2615 /* Function comp_dr_with_seg_len_pair.
2617 Comparison function for sorting objects of dr_with_seg_len_pair_t
2618 so that we can combine aliasing checks in one scan. */
2620 static int
2621 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2623 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2624 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2626 const dr_with_seg_len &p11 = p1->first,
2627 &p12 = p1->second,
2628 &p21 = p2->first,
2629 &p22 = p2->second;
2631 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2632 if a and c have the same base address and step, and b and d have the same
2633 address and step. Therefore, if either a&c or b&d don't have the same address
2634 and step, we don't care about the order of those two pairs after sorting. */
2635 int comp_res;
2637 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2638 DR_BASE_ADDRESS (p21.dr))) != 0)
2639 return comp_res;
2640 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2641 DR_BASE_ADDRESS (p22.dr))) != 0)
2642 return comp_res;
2643 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2644 return comp_res;
2645 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2646 return comp_res;
2647 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2648 return comp_res;
2649 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2650 return comp_res;
2652 return 0;
2655 template <class T> static void
2656 swap (T& a, T& b)
2658 T c (a);
2659 a = b;
2660 b = c;
2663 /* Function vect_vfa_segment_size.
2665 Create an expression that computes the size of segment
2666 that will be accessed for a data reference. The function takes into
2667 account that realignment loads may access one more vector.
2669 Input:
2670 DR: The data reference.
2671 LENGTH_FACTOR: segment length to consider.
2673 Return an expression whose value is the size of segment which will be
2674 accessed by DR. */
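/* For example (illustrative): for a data reference with a 4-byte step and
   LENGTH_FACTOR equal to a vectorization factor of 4, the segment length
   is 16 bytes; if the access uses the optimized explicit realignment
   scheme, one extra vector size is added on top. */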
2676 static tree
2677 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2679 tree segment_length;
2681 if (integer_zerop (DR_STEP (dr)))
2682 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2683 else
2684 segment_length = size_binop (MULT_EXPR,
2685 fold_convert (sizetype, DR_STEP (dr)),
2686 fold_convert (sizetype, length_factor));
2688 if (vect_supportable_dr_alignment (dr, false)
2689 == dr_explicit_realign_optimized)
2691 tree vector_size = TYPE_SIZE_UNIT
2692 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2694 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2696 return segment_length;
2699 /* Function vect_prune_runtime_alias_test_list.
2701 Prune a list of ddrs to be tested at run-time by versioning for alias.
2702 Merge several alias checks into one if possible.
2703 Return FALSE if the resulting list of ddrs is longer than allowed by
2704 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2706 bool
2707 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2709 vec<ddr_p> may_alias_ddrs =
2710 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2711 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2712 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2713 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2714 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2716 ddr_p ddr;
2717 unsigned int i;
2718 tree length_factor;
2720 if (dump_enabled_p ())
2721 dump_printf_loc (MSG_NOTE, vect_location,
2722 "=== vect_prune_runtime_alias_test_list ===\n");
2724 if (may_alias_ddrs.is_empty ())
2725 return true;
2727 /* Basically, for each pair of dependent data refs store_ptr_0
2728 and load_ptr_0, we create an expression:
2730 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2731 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2733 for aliasing checks. However, in some cases we can decrease
2734 the number of checks by combining two checks into one. For
2735 example, suppose we have another pair of data refs store_ptr_0
2736 and load_ptr_1, and if the following condition is satisfied:
2738 load_ptr_0 < load_ptr_1 &&
2739 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2741 (this condition means that, in each iteration of the vectorized loop,
2742 the accessed memory of store_ptr_0 cannot be between the memory
2743 of load_ptr_0 and load_ptr_1.)
2745 we can then use only the following expression to finish the
2746 aliasing checks between store_ptr_0 & load_ptr_0 and
2747 store_ptr_0 & load_ptr_1:
2749 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2750 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2752 Note that we only consider the case in which load_ptr_0 and load_ptr_1
2753 have the same base address. */
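/* For instance (illustrative), with 16-byte segments, the two checks
   (store_ptr_0 vs. load_ptr_0) and (store_ptr_0 vs. load_ptr_0 + 16)
   satisfy the condition above and collapse into a single check of
   store_ptr_0 against the 32-byte range starting at load_ptr_0. */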
2755 comp_alias_ddrs.create (may_alias_ddrs.length ());
2757 /* First, we collect all data ref pairs for aliasing checks. */
2758 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2760 struct data_reference *dr_a, *dr_b;
2761 gimple dr_group_first_a, dr_group_first_b;
2762 tree segment_length_a, segment_length_b;
2763 gimple stmt_a, stmt_b;
2765 dr_a = DDR_A (ddr);
2766 stmt_a = DR_STMT (DDR_A (ddr));
2767 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2768 if (dr_group_first_a)
2770 stmt_a = dr_group_first_a;
2771 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2774 dr_b = DDR_B (ddr);
2775 stmt_b = DR_STMT (DDR_B (ddr));
2776 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2777 if (dr_group_first_b)
2779 stmt_b = dr_group_first_b;
2780 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2783 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2784 length_factor = scalar_loop_iters;
2785 else
2786 length_factor = size_int (vect_factor);
2787 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2788 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2790 dr_with_seg_len_pair_t dr_with_seg_len_pair
2791 (dr_with_seg_len (dr_a, segment_length_a),
2792 dr_with_seg_len (dr_b, segment_length_b));
2794 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2795 swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2797 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2800 /* Second, we sort the collected data ref pairs so that we can scan
2801 them once to combine all possible aliasing checks. */
2802 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2804 /* Third, we scan the sorted dr pairs and check if we can combine
2805 alias checks of two neighbouring dr pairs. */
2806 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2808 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2809 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2810 *dr_b1 = &comp_alias_ddrs[i-1].second,
2811 *dr_a2 = &comp_alias_ddrs[i].first,
2812 *dr_b2 = &comp_alias_ddrs[i].second;
2814 /* Remove duplicate data ref pairs. */
2815 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2817 if (dump_enabled_p ())
2819 dump_printf_loc (MSG_NOTE, vect_location,
2820 "found equal ranges ");
2821 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2822 DR_REF (dr_a1->dr));
2823 dump_printf (MSG_NOTE, ", ");
2824 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2825 DR_REF (dr_b1->dr));
2826 dump_printf (MSG_NOTE, " and ");
2827 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2828 DR_REF (dr_a2->dr));
2829 dump_printf (MSG_NOTE, ", ");
2830 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2831 DR_REF (dr_b2->dr));
2832 dump_printf (MSG_NOTE, "\n");
2835 comp_alias_ddrs.ordered_remove (i--);
2836 continue;
2839 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2841 /* We consider the case that DR_B1 and DR_B2 are the same memref,
2842 and DR_A1 and DR_A2 are two consecutive memrefs. */
2843 if (*dr_a1 == *dr_a2)
2845 swap (dr_a1, dr_b1);
2846 swap (dr_a2, dr_b2);
2849 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2850 DR_BASE_ADDRESS (dr_a2->dr),
2852 || !tree_fits_shwi_p (dr_a1->offset)
2853 || !tree_fits_shwi_p (dr_a2->offset))
2854 continue;
2856 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2857 - tree_to_shwi (dr_a1->offset));
2860 /* Now we check if the following condition is satisfied:
2862 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2864 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2865 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2866 have to make a best estimate. We can get the minimum value
2867 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2868 then either of the following two conditions can guarantee the
2869 one above:
2871 1: DIFF <= MIN_SEG_LEN_B
2872 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
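     For instance (illustrative), if DIFF is 16 and MIN_SEG_LEN_B is 16,
     condition 1 holds, so DR_A1's segment length is extended to
     DR_A2's segment length plus 16 and the second pair is dropped.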
2876 HOST_WIDE_INT
2877 min_seg_len_b = (TREE_CODE (dr_b1->seg_len) == INTEGER_CST) ?
2878 TREE_INT_CST_LOW (dr_b1->seg_len) :
2879 vect_factor;
2881 if (diff <= min_seg_len_b
2882 || (TREE_CODE (dr_a1->seg_len) == INTEGER_CST
2883 && diff - (HOST_WIDE_INT) TREE_INT_CST_LOW (dr_a1->seg_len) <
2884 min_seg_len_b))
2886 dr_a1->seg_len = size_binop (PLUS_EXPR,
2887 dr_a2->seg_len, size_int (diff));
2888 comp_alias_ddrs.ordered_remove (i--);
2893 if ((int) comp_alias_ddrs.length () >
2894 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2896 if (dump_enabled_p ())
2898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2899 "disable versioning for alias - max number of "
2900 "generated checks exceeded.\n");
2903 return false;
2906 return true;
2909 /* Check whether a non-affine read in stmt is suitable for gather load
2910 and if so, return a builtin decl for that operation. */
2912 tree
2913 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2914 tree *offp, int *scalep)
2916 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2917 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2918 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2919 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2920 tree offtype = NULL_TREE;
2921 tree decl, base, off;
2922 enum machine_mode pmode;
2923 int punsignedp, pvolatilep;
2925 base = DR_REF (dr);
2926 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2927 see if we can use the def stmt of the address. */
2928 if (is_gimple_call (stmt)
2929 && gimple_call_internal_p (stmt)
2930 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2931 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2932 && TREE_CODE (base) == MEM_REF
2933 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2934 && integer_zerop (TREE_OPERAND (base, 1))
2935 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2937 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2938 if (is_gimple_assign (def_stmt)
2939 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
2940 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2943 /* The gather builtins need an address of the form
2944 loop_invariant + vector * {1, 2, 4, 8}, or
2946 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2947 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2948 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2949 multiplications and additions in it. To get a vector, we need
2950 a single SSA_NAME that will be defined in the loop and will
2951 contain everything that is not loop invariant and that can be
2952 vectorized. The following code attempts to find such a preexisting
2953 SSA_NAME OFF and put the loop invariants into a tree BASE
2954 that can be gimplified before the loop. */
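/* For instance (illustrative), for a loop-invariant pointer p and a read
   p[idx] with idx defined in the loop, the walk below roughly ends up with
   BASE = p (plus any constant byte offset), OFF = the SSA_NAME holding idx,
   and SCALE = the element size peeled off a MULT_EXPR. */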
2955 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
2956 &pmode, &punsignedp, &pvolatilep, false);
2957 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2959 if (TREE_CODE (base) == MEM_REF)
2961 if (!integer_zerop (TREE_OPERAND (base, 1)))
2963 if (off == NULL_TREE)
2965 double_int moff = mem_ref_offset (base);
2966 off = double_int_to_tree (sizetype, moff);
2968 else
2969 off = size_binop (PLUS_EXPR, off,
2970 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2972 base = TREE_OPERAND (base, 0);
2974 else
2975 base = build_fold_addr_expr (base);
2977 if (off == NULL_TREE)
2978 off = size_zero_node;
2980 /* If base is not loop invariant, then: if off is 0, we start with just
2981 the constant offset in the loop-invariant BASE and continue with base
2982 as OFF; otherwise give up.
2983 We could handle that case by gimplifying the addition of base + off
2984 into some SSA_NAME and using that as off, but for now punt. */
2985 if (!expr_invariant_in_loop_p (loop, base))
2987 if (!integer_zerop (off))
2988 return NULL_TREE;
2989 off = base;
2990 base = size_int (pbitpos / BITS_PER_UNIT);
2992 /* Otherwise put base + constant offset into the loop invariant BASE
2993 and continue with OFF. */
2994 else
2996 base = fold_convert (sizetype, base);
2997 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3000 /* OFF at this point may be either a SSA_NAME or some tree expression
3001 from get_inner_reference. Try to peel off loop invariants from it
3002 into BASE as long as possible. */
3003 STRIP_NOPS (off);
3004 while (offtype == NULL_TREE)
3006 enum tree_code code;
3007 tree op0, op1, add = NULL_TREE;
3009 if (TREE_CODE (off) == SSA_NAME)
3011 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3013 if (expr_invariant_in_loop_p (loop, off))
3014 return NULL_TREE;
3016 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3017 break;
3019 op0 = gimple_assign_rhs1 (def_stmt);
3020 code = gimple_assign_rhs_code (def_stmt);
3021 op1 = gimple_assign_rhs2 (def_stmt);
3023 else
3025 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3026 return NULL_TREE;
3027 code = TREE_CODE (off);
3028 extract_ops_from_tree (off, &code, &op0, &op1);
3030 switch (code)
3032 case POINTER_PLUS_EXPR:
3033 case PLUS_EXPR:
3034 if (expr_invariant_in_loop_p (loop, op0))
3036 add = op0;
3037 off = op1;
3038 do_add:
3039 add = fold_convert (sizetype, add);
3040 if (scale != 1)
3041 add = size_binop (MULT_EXPR, add, size_int (scale));
3042 base = size_binop (PLUS_EXPR, base, add);
3043 continue;
3045 if (expr_invariant_in_loop_p (loop, op1))
3047 add = op1;
3048 off = op0;
3049 goto do_add;
3051 break;
3052 case MINUS_EXPR:
3053 if (expr_invariant_in_loop_p (loop, op1))
3055 add = fold_convert (sizetype, op1);
3056 add = size_binop (MINUS_EXPR, size_zero_node, add);
3057 off = op0;
3058 goto do_add;
3060 break;
3061 case MULT_EXPR:
3062 if (scale == 1 && tree_fits_shwi_p (op1))
3064 scale = tree_to_shwi (op1);
3065 off = op0;
3066 continue;
3068 break;
3069 case SSA_NAME:
3070 off = op0;
3071 continue;
3072 CASE_CONVERT:
3073 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3074 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3075 break;
3076 if (TYPE_PRECISION (TREE_TYPE (op0))
3077 == TYPE_PRECISION (TREE_TYPE (off)))
3079 off = op0;
3080 continue;
3082 if (TYPE_PRECISION (TREE_TYPE (op0))
3083 < TYPE_PRECISION (TREE_TYPE (off)))
3085 off = op0;
3086 offtype = TREE_TYPE (off);
3087 STRIP_NOPS (off);
3088 continue;
3090 break;
3091 default:
3092 break;
3094 break;
3097 /* If at the end OFF still isn't a SSA_NAME or isn't
3098 defined in the loop, punt. */
3099 if (TREE_CODE (off) != SSA_NAME
3100 || expr_invariant_in_loop_p (loop, off))
3101 return NULL_TREE;
3103 if (offtype == NULL_TREE)
3104 offtype = TREE_TYPE (off);
3106 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3107 offtype, scale);
3108 if (decl == NULL_TREE)
3109 return NULL_TREE;
3111 if (basep)
3112 *basep = base;
3113 if (offp)
3114 *offp = off;
3115 if (scalep)
3116 *scalep = scale;
3117 return decl;
3120 /* Function vect_analyze_data_refs.
3122 Find all the data references in the loop or basic block.
3124 The general structure of the analysis of data refs in the vectorizer is as
3125 follows:
3126 1- vect_analyze_data_refs(loop/bb): call
3127 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3128 in the loop/bb and their dependences.
3129 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3130 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3131 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3135 bool
3136 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3137 bb_vec_info bb_vinfo,
3138 int *min_vf)
3140 struct loop *loop = NULL;
3141 basic_block bb = NULL;
3142 unsigned int i;
3143 vec<data_reference_p> datarefs;
3144 struct data_reference *dr;
3145 tree scalar_type;
3147 if (dump_enabled_p ())
3148 dump_printf_loc (MSG_NOTE, vect_location,
3149 "=== vect_analyze_data_refs ===\n");
3151 if (loop_vinfo)
3153 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3155 loop = LOOP_VINFO_LOOP (loop_vinfo);
3156 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3157 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3159 if (dump_enabled_p ())
3160 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3161 "not vectorized: loop contains function calls"
3162 " or data references that cannot be analyzed\n");
3163 return false;
3166 for (i = 0; i < loop->num_nodes; i++)
3168 gimple_stmt_iterator gsi;
3170 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3172 gimple stmt = gsi_stmt (gsi);
3173 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3175 if (is_gimple_call (stmt) && loop->safelen)
3177 tree fndecl = gimple_call_fndecl (stmt), op;
3178 if (fndecl != NULL_TREE)
3180 struct cgraph_node *node = cgraph_get_node (fndecl);
3181 if (node != NULL && node->simd_clones != NULL)
3183 unsigned int j, n = gimple_call_num_args (stmt);
3184 for (j = 0; j < n; j++)
3186 op = gimple_call_arg (stmt, j);
3187 if (DECL_P (op)
3188 || (REFERENCE_CLASS_P (op)
3189 && get_base_address (op)))
3190 break;
3192 op = gimple_call_lhs (stmt);
3193 /* Ignore #pragma omp declare simd functions
3194 if they don't have data references in the
3195 call stmt itself. */
3196 if (j == n
3197 && !(op
3198 && (DECL_P (op)
3199 || (REFERENCE_CLASS_P (op)
3200 && get_base_address (op)))))
3201 continue;
3205 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3206 if (dump_enabled_p ())
3207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3208 "not vectorized: loop contains function "
3209 "calls or data references that cannot "
3210 "be analyzed\n");
3211 return false;
3216 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3218 else
3220 gimple_stmt_iterator gsi;
3222 bb = BB_VINFO_BB (bb_vinfo);
3223 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3225 gimple stmt = gsi_stmt (gsi);
3226 if (!find_data_references_in_stmt (NULL, stmt,
3227 &BB_VINFO_DATAREFS (bb_vinfo)))
3229 /* Mark the rest of the basic-block as unvectorizable. */
3230 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3232 stmt = gsi_stmt (gsi);
3233 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3235 break;
3239 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3242 /* Go through the data-refs, check that the analysis succeeded. Update
3243 pointer from stmt_vec_info struct to DR and vectype. */
3245 FOR_EACH_VEC_ELT (datarefs, i, dr)
3247 gimple stmt;
3248 stmt_vec_info stmt_info;
3249 tree base, offset, init;
3250 bool gather = false;
3251 bool simd_lane_access = false;
3252 int vf;
3254 again:
3255 if (!dr || !DR_REF (dr))
3257 if (dump_enabled_p ())
3258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3259 "not vectorized: unhandled data-ref\n");
3260 return false;
3263 stmt = DR_STMT (dr);
3264 stmt_info = vinfo_for_stmt (stmt);
3266 /* Discard clobbers from the dataref vector. We will remove
3267 clobber stmts during vectorization. */
3268 if (gimple_clobber_p (stmt))
3270 if (i == datarefs.length () - 1)
3272 datarefs.pop ();
3273 break;
3275 datarefs[i] = datarefs.pop ();
3276 goto again;
3279 /* Check that analysis of the data-ref succeeded. */
3280 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3281 || !DR_STEP (dr))
3283 bool maybe_gather
3284 = DR_IS_READ (dr)
3285 && !TREE_THIS_VOLATILE (DR_REF (dr))
3286 && targetm.vectorize.builtin_gather != NULL;
3287 bool maybe_simd_lane_access
3288 = loop_vinfo && loop->simduid;
3290 /* If the target supports vector gather loads, or if this might be
3291 a SIMD lane access, see if they can be used. */
3292 if (loop_vinfo
3293 && (maybe_gather || maybe_simd_lane_access)
3294 && !nested_in_vect_loop_p (loop, stmt))
3296 struct data_reference *newdr
3297 = create_data_ref (NULL, loop_containing_stmt (stmt),
3298 DR_REF (dr), stmt, true);
3299 gcc_assert (newdr != NULL && DR_REF (newdr));
3300 if (DR_BASE_ADDRESS (newdr)
3301 && DR_OFFSET (newdr)
3302 && DR_INIT (newdr)
3303 && DR_STEP (newdr)
3304 && integer_zerop (DR_STEP (newdr)))
3306 if (maybe_simd_lane_access)
3308 tree off = DR_OFFSET (newdr);
3309 STRIP_NOPS (off);
3310 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3311 && TREE_CODE (off) == MULT_EXPR
3312 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3314 tree step = TREE_OPERAND (off, 1);
3315 off = TREE_OPERAND (off, 0);
3316 STRIP_NOPS (off);
3317 if (CONVERT_EXPR_P (off)
3318 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3319 0)))
3320 < TYPE_PRECISION (TREE_TYPE (off)))
3321 off = TREE_OPERAND (off, 0);
3322 if (TREE_CODE (off) == SSA_NAME)
3324 gimple def = SSA_NAME_DEF_STMT (off);
3325 tree reft = TREE_TYPE (DR_REF (newdr));
3326 if (is_gimple_call (def)
3327 && gimple_call_internal_p (def)
3328 && (gimple_call_internal_fn (def)
3329 == IFN_GOMP_SIMD_LANE))
3331 tree arg = gimple_call_arg (def, 0);
3332 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3333 arg = SSA_NAME_VAR (arg);
3334 if (arg == loop->simduid
3335 /* For now. */
3336 && tree_int_cst_equal
3337 (TYPE_SIZE_UNIT (reft),
3338 step))
3340 DR_OFFSET (newdr) = ssize_int (0);
3341 DR_STEP (newdr) = step;
3342 DR_ALIGNED_TO (newdr)
3343 = size_int (BIGGEST_ALIGNMENT);
3344 dr = newdr;
3345 simd_lane_access = true;
3351 if (!simd_lane_access && maybe_gather)
3353 dr = newdr;
3354 gather = true;
3357 if (!gather && !simd_lane_access)
3358 free_data_ref (newdr);
3361 if (!gather && !simd_lane_access)
3363 if (dump_enabled_p ())
3365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3366 "not vectorized: data ref analysis "
3367 "failed ");
3368 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3369 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3372 if (bb_vinfo)
3373 break;
3375 return false;
3379 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3381 if (dump_enabled_p ())
3382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3383 "not vectorized: base addr of dr is a "
3384 "constant\n");
3386 if (bb_vinfo)
3387 break;
3389 if (gather || simd_lane_access)
3390 free_data_ref (dr);
3391 return false;
3394 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3396 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3399 "not vectorized: volatile type ");
3400 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3401 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3404 if (bb_vinfo)
3405 break;
3407 return false;
3410 if (stmt_can_throw_internal (stmt))
3412 if (dump_enabled_p ())
3414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3415 "not vectorized: statement can throw an "
3416 "exception ");
3417 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3418 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3421 if (bb_vinfo)
3422 break;
3424 if (gather || simd_lane_access)
3425 free_data_ref (dr);
3426 return false;
3429 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3430 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3432 if (dump_enabled_p ())
3434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3435 "not vectorized: statement is bitfield "
3436 "access ");
3437 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3438 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3441 if (bb_vinfo)
3442 break;
3444 if (gather || simd_lane_access)
3445 free_data_ref (dr);
3446 return false;
3449 base = unshare_expr (DR_BASE_ADDRESS (dr));
3450 offset = unshare_expr (DR_OFFSET (dr));
3451 init = unshare_expr (DR_INIT (dr));
3453 if (is_gimple_call (stmt)
3454 && (!gimple_call_internal_p (stmt)
3455 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3456 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3458 if (dump_enabled_p ())
3460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3461 "not vectorized: dr in a call ");
3462 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3463 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3466 if (bb_vinfo)
3467 break;
3469 if (gather || simd_lane_access)
3470 free_data_ref (dr);
3471 return false;
3474 /* Update DR field in stmt_vec_info struct. */
3476 /* If the dataref is in an inner-loop of the loop that is considered for
3477 vectorization, we also want to analyze the access relative to
3478 the outer-loop (DR contains information only relative to the
3479 inner-most enclosing loop). We do that by building a reference to the
3480 first location accessed by the inner-loop, and analyze it relative to
3481 the outer-loop. */
3482 if (loop && nested_in_vect_loop_p (loop, stmt))
3484 tree outer_step, outer_base, outer_init;
3485 HOST_WIDE_INT pbitsize, pbitpos;
3486 tree poffset;
3487 enum machine_mode pmode;
3488 int punsignedp, pvolatilep;
3489 affine_iv base_iv, offset_iv;
3490 tree dinit;
3492 /* Build a reference to the first location accessed by the
3493 inner-loop: *(BASE+INIT). (The first location is actually
3494 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3495 tree inner_base = build_fold_indirect_ref
3496 (fold_build_pointer_plus (base, init));
3498 if (dump_enabled_p ())
3500 dump_printf_loc (MSG_NOTE, vect_location,
3501 "analyze in outer-loop: ");
3502 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3503 dump_printf (MSG_NOTE, "\n");
3506 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3507 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3508 gcc_assert (outer_base != NULL_TREE);
3510 if (pbitpos % BITS_PER_UNIT != 0)
3512 if (dump_enabled_p ())
3513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3514 "failed: bit offset alignment.\n");
3515 return false;
3518 outer_base = build_fold_addr_expr (outer_base);
3519 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3520 &base_iv, false))
3522 if (dump_enabled_p ())
3523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3524 "failed: evolution of base is not affine.\n");
3525 return false;
3528 if (offset)
3530 if (poffset)
3531 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3532 poffset);
3533 else
3534 poffset = offset;
3537 if (!poffset)
3539 offset_iv.base = ssize_int (0);
3540 offset_iv.step = ssize_int (0);
3542 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3543 &offset_iv, false))
3545 if (dump_enabled_p ())
3546 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3547 "evolution of offset is not affine.\n");
3548 return false;
3551 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3552 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3553 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3554 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3555 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3557 outer_step = size_binop (PLUS_EXPR,
3558 fold_convert (ssizetype, base_iv.step),
3559 fold_convert (ssizetype, offset_iv.step));
3561 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3562 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3563 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3564 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3565 STMT_VINFO_DR_OFFSET (stmt_info) =
3566 fold_convert (ssizetype, offset_iv.base);
3567 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3568 size_int (highest_pow2_factor (offset_iv.base));
3570 if (dump_enabled_p ())
3572 dump_printf_loc (MSG_NOTE, vect_location,
3573 "\touter base_address: ");
3574 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3575 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3576 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3577 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3578 STMT_VINFO_DR_OFFSET (stmt_info));
3579 dump_printf (MSG_NOTE,
3580 "\n\touter constant offset from base address: ");
3581 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3582 STMT_VINFO_DR_INIT (stmt_info));
3583 dump_printf (MSG_NOTE, "\n\touter step: ");
3584 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3585 STMT_VINFO_DR_STEP (stmt_info));
3586 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3587 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3588 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3589 dump_printf (MSG_NOTE, "\n");
3593 if (STMT_VINFO_DATA_REF (stmt_info))
3595 if (dump_enabled_p ())
3597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3598 "not vectorized: more than one data ref "
3599 "in stmt: ");
3600 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3601 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3604 if (bb_vinfo)
3605 break;
3607 if (gather || simd_lane_access)
3608 free_data_ref (dr);
3609 return false;
3612 STMT_VINFO_DATA_REF (stmt_info) = dr;
3613 if (simd_lane_access)
3615 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3616 datarefs[i] = dr;
3619 /* Set vectype for STMT. */
3620 scalar_type = TREE_TYPE (DR_REF (dr));
3621 STMT_VINFO_VECTYPE (stmt_info) =
3622 get_vectype_for_scalar_type (scalar_type);
3623 if (!STMT_VINFO_VECTYPE (stmt_info))
3625 if (dump_enabled_p ())
3627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3628 "not vectorized: no vectype for stmt: ");
3629 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3630 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3631 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3632 scalar_type);
3633 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3636 if (bb_vinfo)
3637 break;
3639 if (gather || simd_lane_access)
3641 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3642 free_data_ref (dr);
3644 return false;
3646 else
3648 if (dump_enabled_p ())
3650 dump_printf_loc (MSG_NOTE, vect_location,
3651 "got vectype for stmt: ");
3652 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3653 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3654 STMT_VINFO_VECTYPE (stmt_info));
3655 dump_printf (MSG_NOTE, "\n");
3659 /* Adjust the minimal vectorization factor according to the
3660 vector type. */
3661 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3662 if (vf > *min_vf)
3663 *min_vf = vf;
3665 if (gather)
3667 tree off;
3669 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3670 if (gather
3671 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3672 gather = false;
3673 if (!gather)
3675 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3676 free_data_ref (dr);
3677 if (dump_enabled_p ())
3679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3680 "not vectorized: not suitable for gather "
3681 "load ");
3682 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3683 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3685 return false;
3688 datarefs[i] = dr;
3689 STMT_VINFO_GATHER_P (stmt_info) = true;
3691 else if (loop_vinfo
3692 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3694 if (nested_in_vect_loop_p (loop, stmt)
3695 || !DR_IS_READ (dr))
3697 if (dump_enabled_p ())
3699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3700 "not vectorized: not suitable for strided "
3701 "load ");
3702 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3703 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3705 return false;
3707 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3711 /* If we stopped the analysis at the first dataref we could not analyze
3712 when trying to vectorize a basic-block, mark the rest of the datarefs
3713 as not vectorizable and truncate the vector of datarefs. That
3714 avoids wasting time analyzing their dependences. */
3715 if (i != datarefs.length ())
3717 gcc_assert (bb_vinfo != NULL);
3718 for (unsigned j = i; j < datarefs.length (); ++j)
3720 data_reference_p dr = datarefs[j];
3721 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3722 free_data_ref (dr);
3724 datarefs.truncate (i);
3727 return true;
3731 /* Function vect_get_new_vect_var.
3733 Returns a name for a new variable. The current naming scheme prepends
3734 the prefix "vect_", "stmp_" or "vectp_" (depending on the value of
3735 VAR_KIND) to NAME when NAME is provided; otherwise the bare prefix is
3736 used as the name. */
3738 tree
3739 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3741 const char *prefix;
3742 tree new_vect_var;
3744 switch (var_kind)
3746 case vect_simple_var:
3747 prefix = "vect";
3748 break;
3749 case vect_scalar_var:
3750 prefix = "stmp";
3751 break;
3752 case vect_pointer_var:
3753 prefix = "vectp";
3754 break;
3755 default:
3756 gcc_unreachable ();
3759 if (name)
3761 char* tmp = concat (prefix, "_", name, NULL);
3762 new_vect_var = create_tmp_reg (type, tmp);
3763 free (tmp);
3765 else
3766 new_vect_var = create_tmp_reg (type, prefix);
3768 return new_vect_var;
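/* For illustration: a call such as
   vect_get_new_vect_var (vectype, vect_simple_var, "sum")
   would create a temporary register of type VECTYPE whose name starts
   with "vect_sum" (create_tmp_reg may add a uniquifying suffix), while
   passing NULL for NAME yields a temporary named after the bare prefix. */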
3772 /* Function vect_create_addr_base_for_vector_ref.
3774 Create an expression that computes the address of the first memory location
3775 that will be accessed for a data reference.
3777 Input:
3778 STMT: The statement containing the data reference.
3779 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3780 OFFSET: Optional. If supplied, it is added to the initial address.
3781 LOOP: Specify relative to which loop-nest the address should be computed.
3782 For example, when the dataref is in an inner-loop nested in an
3783 outer-loop that is now being vectorized, LOOP can be either the
3784 outer-loop, or the inner-loop. The first memory location accessed
3785 by the following dataref ('in' points to short):
3787 for (i=0; i<N; i++)
3788 for (j=0; j<M; j++)
3789 s += in[i+j]
3791 is as follows:
3792 if LOOP=i_loop: &in (relative to i_loop)
3793 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3795 Output:
3796 1. Return an SSA_NAME whose value is the address of the memory location of
3797 the first vector of the data reference.
3798 2. If NEW_STMT_LIST is not NULL_TREE after return, the caller must insert
3799 these statement(s), which define the returned SSA_NAME.
3801 FORNOW: We are only handling array accesses with step 1. */
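/* In rough terms, for the loop-based case the code below computes

   addr_base = (vectype *) (DR_BASE_ADDRESS (dr)
                            + DR_OFFSET (dr) + DR_INIT (dr)
                            + OFFSET * sizeof (scalar element))

   where the OFFSET term is added only when OFFSET is supplied; the
   basic-block case instead takes the address of DR_REF (dr) directly.
   The statements that compute the value are appended to NEW_STMT_LIST. */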
3803 tree
3804 vect_create_addr_base_for_vector_ref (gimple stmt,
3805 gimple_seq *new_stmt_list,
3806 tree offset,
3807 struct loop *loop)
3809 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3810 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3811 tree data_ref_base;
3812 const char *base_name;
3813 tree addr_base;
3814 tree dest;
3815 gimple_seq seq = NULL;
3816 tree base_offset;
3817 tree init;
3818 tree vect_ptr_type;
3819 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3820 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3822 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3824 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3826 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3828 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3829 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3830 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3832 else
3834 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3835 base_offset = unshare_expr (DR_OFFSET (dr));
3836 init = unshare_expr (DR_INIT (dr));
3839 if (loop_vinfo)
3840 base_name = get_name (data_ref_base);
3841 else
3843 base_offset = ssize_int (0);
3844 init = ssize_int (0);
3845 base_name = get_name (DR_REF (dr));
3848 /* Create base_offset */
3849 base_offset = size_binop (PLUS_EXPR,
3850 fold_convert (sizetype, base_offset),
3851 fold_convert (sizetype, init));
3853 if (offset)
3855 offset = fold_build2 (MULT_EXPR, sizetype,
3856 fold_convert (sizetype, offset), step);
3857 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3858 base_offset, offset);
3861 /* base + base_offset */
3862 if (loop_vinfo)
3863 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3864 else
3866 addr_base = build1 (ADDR_EXPR,
3867 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3868 unshare_expr (DR_REF (dr)));
3871 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3872 addr_base = fold_convert (vect_ptr_type, addr_base);
3873 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3874 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3875 gimple_seq_add_seq (new_stmt_list, seq);
3877 if (DR_PTR_INFO (dr)
3878 && TREE_CODE (addr_base) == SSA_NAME)
3880 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3881 if (offset)
3882 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3885 if (dump_enabled_p ())
3887 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3888 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3889 dump_printf (MSG_NOTE, "\n");
3892 return addr_base;
3896 /* Function vect_create_data_ref_ptr.
3898 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3899 location accessed in the loop by STMT, along with the def-use update
3900 chain to appropriately advance the pointer through the loop iterations.
3901 Also set aliasing information for the pointer. This pointer is used by
3902 the callers to this function to create a memory reference expression for
3903 vector load/store access.
3905 Input:
3906 1. STMT: a stmt that references memory. Expected to be of the form
3907 GIMPLE_ASSIGN <name, data-ref> or
3908 GIMPLE_ASSIGN <data-ref, name>.
3909 2. AGGR_TYPE: the type of the reference, which should be either a vector
3910 or an array.
3911 3. AT_LOOP: the loop where the vector memref is to be created.
3912 4. OFFSET (optional): an offset to be added to the initial address accessed
3913 by the data-ref in STMT.
3914 5. BSI: location where the new stmts are to be placed if there is no loop
3915 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3916 pointing to the initial address.
3918 Output:
3919 1. Declare a new ptr to vector_type, and have it point to the base of the
3920 data reference (initial address accessed by the data reference).
3921 For example, for a vector of type V8HI, the following code is generated:
3923 v8hi *ap;
3924 ap = (v8hi *)initial_address;
3926 if OFFSET is not supplied:
3927 initial_address = &a[init];
3928 if OFFSET is supplied:
3929 initial_address = &a[init + OFFSET];
3931 Return the initial_address in INITIAL_ADDRESS.
3933 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3934 update the pointer in each iteration of the loop.
3936 Return the increment stmt that updates the pointer in PTR_INCR.
3938 3. Set INV_P to true if the access pattern of the data reference in the
3939 vectorized loop is invariant. Set it to false otherwise.
3941 4. Return the pointer. */
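/* As an example of a typical call, vect_setup_realignment below obtains
   the pointer for its prolog load with roughly

   ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
                                   NULL_TREE, &init_addr, NULL, &inc,
                                   true, &inv_p);

   passing ONLY_INIT as true because that single load before the loop does
   not need the pointer to be advanced by the caller. */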
3943 tree
3944 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3945 tree offset, tree *initial_address,
3946 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3947 bool only_init, bool *inv_p)
3949 const char *base_name;
3950 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3951 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3952 struct loop *loop = NULL;
3953 bool nested_in_vect_loop = false;
3954 struct loop *containing_loop = NULL;
3955 tree aggr_ptr_type;
3956 tree aggr_ptr;
3957 tree new_temp;
3958 gimple vec_stmt;
3959 gimple_seq new_stmt_list = NULL;
3960 edge pe = NULL;
3961 basic_block new_bb;
3962 tree aggr_ptr_init;
3963 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3964 tree aptr;
3965 gimple_stmt_iterator incr_gsi;
3966 bool insert_after;
3967 tree indx_before_incr, indx_after_incr;
3968 gimple incr;
3969 tree step;
3970 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3972 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3973 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3975 if (loop_vinfo)
3977 loop = LOOP_VINFO_LOOP (loop_vinfo);
3978 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3979 containing_loop = (gimple_bb (stmt))->loop_father;
3980 pe = loop_preheader_edge (loop);
3982 else
3984 gcc_assert (bb_vinfo);
3985 only_init = true;
3986 *ptr_incr = NULL;
3989 /* Check the step (evolution) of the load in LOOP, and record
3990 whether it's invariant. */
3991 if (nested_in_vect_loop)
3992 step = STMT_VINFO_DR_STEP (stmt_info);
3993 else
3994 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3996 if (integer_zerop (step))
3997 *inv_p = true;
3998 else
3999 *inv_p = false;
4001 /* Create an expression for the first address accessed by this load
4002 in LOOP. */
4003 base_name = get_name (DR_BASE_ADDRESS (dr));
4005 if (dump_enabled_p ())
4007 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4008 dump_printf_loc (MSG_NOTE, vect_location,
4009 "create %s-pointer variable to type: ",
4010 get_tree_code_name (TREE_CODE (aggr_type)));
4011 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4012 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4013 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4014 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4015 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4016 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4017 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4018 else
4019 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4020 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4021 dump_printf (MSG_NOTE, "\n");
4024 /* (1) Create the new aggregate-pointer variable.
4025 Vector and array types inherit the alias set of their component
4026 type by default so we need to use a ref-all pointer if the data
4027 reference does not conflict with the created aggregated data
4028 reference because it is not addressable. */
4029 bool need_ref_all = false;
4030 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4031 get_alias_set (DR_REF (dr))))
4032 need_ref_all = true;
4033 /* Likewise for any of the data references in the stmt group. */
4034 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4036 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4039 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4040 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4041 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4042 get_alias_set (DR_REF (sdr))))
4044 need_ref_all = true;
4045 break;
4047 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4049 while (orig_stmt);
4051 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4052 need_ref_all);
4053 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4056 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4057 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4058 def-use update cycles for the pointer: one relative to the outer-loop
4059 (LOOP), which is what steps (3) and (4) below do. The other is relative
4060 to the inner-loop (which is the inner-most loop containing the dataref),
4061 and this is done by step (5) below.
4063 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4064 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4065 redundant. Steps (3),(4) create the following:
4067 vp0 = &base_addr;
4068 LOOP: vp1 = phi(vp0,vp2)
4071 vp2 = vp1 + step
4072 goto LOOP
4074 If there is an inner-loop nested in loop, then step (5) will also be
4075 applied, and an additional update in the inner-loop will be created:
4077 vp0 = &base_addr;
4078 LOOP: vp1 = phi(vp0,vp2)
4080 inner: vp3 = phi(vp1,vp4)
4081 vp4 = vp3 + inner_step
4082 if () goto inner
4084 vp2 = vp1 + step
4085 if () goto LOOP */
4087 /* (2) Calculate the initial address of the aggregate-pointer, and set
4088 the aggregate-pointer to point to it before the loop. */
4090 /* Create: &(base[init_val+offset]) in the loop preheader. */
4092 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4093 offset, loop);
4094 if (new_stmt_list)
4096 if (pe)
4098 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4099 gcc_assert (!new_bb);
4101 else
4102 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4105 *initial_address = new_temp;
4107 /* Create: p = (aggr_type *) initial_base */
4108 if (TREE_CODE (new_temp) != SSA_NAME
4109 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4111 vec_stmt = gimple_build_assign (aggr_ptr,
4112 fold_convert (aggr_ptr_type, new_temp));
4113 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4114 /* Copy the points-to information if it exists. */
4115 if (DR_PTR_INFO (dr))
4116 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4117 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4118 if (pe)
4120 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4121 gcc_assert (!new_bb);
4123 else
4124 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4126 else
4127 aggr_ptr_init = new_temp;
4129 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4130 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4131 inner-loop nested in LOOP (during outer-loop vectorization). */
4133 /* No update in loop is required. */
4134 if (only_init && (!loop_vinfo || at_loop == loop))
4135 aptr = aggr_ptr_init;
4136 else
4138 /* The step of the aggregate pointer is the type size. */
4139 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4140 /* One exception to the above is when the scalar step of the load in
4141 LOOP is zero. In this case the step here is also zero. */
4142 if (*inv_p)
4143 iv_step = size_zero_node;
4144 else if (tree_int_cst_sgn (step) == -1)
4145 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4147 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4149 create_iv (aggr_ptr_init,
4150 fold_convert (aggr_ptr_type, iv_step),
4151 aggr_ptr, loop, &incr_gsi, insert_after,
4152 &indx_before_incr, &indx_after_incr);
4153 incr = gsi_stmt (incr_gsi);
4154 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4156 /* Copy the points-to information if it exists. */
4157 if (DR_PTR_INFO (dr))
4159 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4160 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4162 if (ptr_incr)
4163 *ptr_incr = incr;
4165 aptr = indx_before_incr;
4168 if (!nested_in_vect_loop || only_init)
4169 return aptr;
4172 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4173 nested in LOOP, if it exists. */
4175 gcc_assert (nested_in_vect_loop);
4176 if (!only_init)
4178 standard_iv_increment_position (containing_loop, &incr_gsi,
4179 &insert_after);
4180 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4181 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4182 &indx_after_incr);
4183 incr = gsi_stmt (incr_gsi);
4184 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4186 /* Copy the points-to information if it exists. */
4187 if (DR_PTR_INFO (dr))
4189 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4190 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4192 if (ptr_incr)
4193 *ptr_incr = incr;
4195 return indx_before_incr;
4197 else
4198 gcc_unreachable ();
4202 /* Function bump_vector_ptr
4204 Increment a pointer (to a vector type) by vector-size. If requested,
4205 i.e. if PTR_INCR is given, then also connect the new increment stmt
4206 to the existing def-use update-chain of the pointer, by modifying
4207 the PTR_INCR as illustrated below:
4209 The pointer def-use update-chain before this function:
4210 DATAREF_PTR = phi (p_0, p_2)
4211 ....
4212 PTR_INCR: p_2 = DATAREF_PTR + step
4214 The pointer def-use update-chain after this function:
4215 DATAREF_PTR = phi (p_0, p_2)
4216 ....
4217 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4218 ....
4219 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4221 Input:
4222 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4223 in the loop.
4224 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4225 the loop. The increment amount across iterations is expected
4226 to be vector_size.
4227 BSI - location where the new update stmt is to be placed.
4228 STMT - the original scalar memory-access stmt that is being vectorized.
4229 BUMP - optional. The offset by which to bump the pointer. If not given,
4230 the offset is assumed to be vector_size.
4232 Output: Return NEW_DATAREF_PTR as illustrated above.
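/* For illustration: with a V8HI vector type TYPE_SIZE_UNIT (vectype) is
   16 bytes, so when BUMP is not supplied the statement generated below
   is essentially NEW_DATAREF_PTR = DATAREF_PTR + 16, emitted as a
   POINTER_PLUS_EXPR. */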
4236 tree
4237 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4238 gimple stmt, tree bump)
4240 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4241 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4242 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4243 tree update = TYPE_SIZE_UNIT (vectype);
4244 gimple incr_stmt;
4245 ssa_op_iter iter;
4246 use_operand_p use_p;
4247 tree new_dataref_ptr;
4249 if (bump)
4250 update = bump;
4252 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4253 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4254 dataref_ptr, update);
4255 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4257 /* Copy the points-to information if it exists. */
4258 if (DR_PTR_INFO (dr))
4260 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4261 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4264 if (!ptr_incr)
4265 return new_dataref_ptr;
4267 /* Update the vector-pointer's cross-iteration increment. */
4268 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4270 tree use = USE_FROM_PTR (use_p);
4272 if (use == dataref_ptr)
4273 SET_USE (use_p, new_dataref_ptr);
4274 else
4275 gcc_assert (tree_int_cst_compare (use, update) == 0);
4278 return new_dataref_ptr;
4282 /* Function vect_create_destination_var.
4284 Create a new temporary of type VECTYPE. */
4286 tree
4287 vect_create_destination_var (tree scalar_dest, tree vectype)
4289 tree vec_dest;
4290 const char *name;
4291 char *new_name;
4292 tree type;
4293 enum vect_var_kind kind;
4295 kind = vectype ? vect_simple_var : vect_scalar_var;
4296 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4298 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4300 name = get_name (scalar_dest);
4301 if (name)
4302 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4303 else
4304 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4305 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4306 free (new_name);
4308 return vec_dest;
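/* For illustration: given a scalar destination such as x_7 and a non-NULL
   VECTYPE, the code above builds the name "x_7" and then requests a
   "vect"-prefixed temporary, so the vector destination is named roughly
   "vect_x_7"; with a NULL VECTYPE an "stmp"-prefixed scalar temporary of
   the scalar destination's type is created instead. */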
4311 /* Function vect_grouped_store_supported.
4313 Returns TRUE if interleave high and interleave low permutations
4314 are supported, and FALSE otherwise. */
4316 bool
4317 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4319 enum machine_mode mode = TYPE_MODE (vectype);
4321 /* vect_permute_store_chain requires the group size to be a power of two. */
4322 if (exact_log2 (count) == -1)
4324 if (dump_enabled_p ())
4325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4326 "the size of the group of accesses"
4327 " is not a power of 2\n");
4328 return false;
4331 /* Check that the permutation is supported. */
4332 if (VECTOR_MODE_P (mode))
4334 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4335 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4336 for (i = 0; i < nelt / 2; i++)
4338 sel[i * 2] = i;
4339 sel[i * 2 + 1] = i + nelt;
4341 if (can_vec_perm_p (mode, false, sel))
4343 for (i = 0; i < nelt; i++)
4344 sel[i] += nelt / 2;
4345 if (can_vec_perm_p (mode, false, sel))
4346 return true;
4350 if (dump_enabled_p ())
4351 dump_printf (MSG_MISSED_OPTIMIZATION,
4352 "interleave op not supported by target.\n");
4353 return false;
4357 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4358 type VECTYPE. */
4360 bool
4361 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4363 return vect_lanes_optab_supported_p ("vec_store_lanes",
4364 vec_store_lanes_optab,
4365 vectype, count);
4369 /* Function vect_permute_store_chain.
4371 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4372 a power of 2, generate interleave_high/low stmts to reorder the data
4373 correctly for the stores. Return the final references for stores in
4374 RESULT_CHAIN.
4376 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4377 The input is 4 vectors each containing 8 elements. We assign a number to
4378 each element; the input sequence is:
4380 1st vec: 0 1 2 3 4 5 6 7
4381 2nd vec: 8 9 10 11 12 13 14 15
4382 3rd vec: 16 17 18 19 20 21 22 23
4383 4th vec: 24 25 26 27 28 29 30 31
4385 The output sequence should be:
4387 1st vec: 0 8 16 24 1 9 17 25
4388 2nd vec: 2 10 18 26 3 11 19 27
4389 3rd vec: 4 12 20 28 5 13 21 29
4390 4th vec: 6 14 22 30 7 15 23 31
4392 i.e., we interleave the contents of the four vectors in their order.
4394 We use interleave_high/low instructions to create such output. The input of
4395 each interleave_high/low operation is two vectors:
4396 1st vec 2nd vec
4397 0 1 2 3 4 5 6 7
4398 the even elements of the result vector are obtained left-to-right from the
4399 high/low elements of the first vector. The odd elements of the result are
4400 obtained left-to-right from the high/low elements of the second vector.
4401 The output of interleave_high will be: 0 4 1 5
4402 and of interleave_low: 2 6 3 7
4405 The permutation is done in log LENGTH stages. In each stage interleave_high
4406 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4407 where the first argument is taken from the first half of DR_CHAIN and the
4408 second argument from its second half.
4409 In our example,
4411 I1: interleave_high (1st vec, 3rd vec)
4412 I2: interleave_low (1st vec, 3rd vec)
4413 I3: interleave_high (2nd vec, 4th vec)
4414 I4: interleave_low (2nd vec, 4th vec)
4416 The output for the first stage is:
4418 I1: 0 16 1 17 2 18 3 19
4419 I2: 4 20 5 21 6 22 7 23
4420 I3: 8 24 9 25 10 26 11 27
4421 I4: 12 28 13 29 14 30 15 31
4423 The output of the second stage, i.e. the final result is:
4425 I1: 0 8 16 24 1 9 17 25
4426 I2: 2 10 18 26 3 11 19 27
4427 I3: 4 12 20 28 5 13 21 29
4428 I4: 6 14 22 30 7 15 23 31. */
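/* As a concrete example of the permutation masks built below, assume
   V8HI vectors (NELT == 8). The selectors then are

   perm_mask_high: { 0, 8, 1, 9, 2, 10, 3, 11 }
   perm_mask_low: { 4, 12, 5, 13, 6, 14, 7, 15 }

   i.e. the "high" mask interleaves the first NELT/2 elements of the two
   input vectors and the "low" mask interleaves the remaining NELT/2. */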
4430 void
4431 vect_permute_store_chain (vec<tree> dr_chain,
4432 unsigned int length,
4433 gimple stmt,
4434 gimple_stmt_iterator *gsi,
4435 vec<tree> *result_chain)
4437 tree vect1, vect2, high, low;
4438 gimple perm_stmt;
4439 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4440 tree perm_mask_low, perm_mask_high;
4441 unsigned int i, n;
4442 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4443 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4445 result_chain->quick_grow (length);
4446 memcpy (result_chain->address (), dr_chain.address (),
4447 length * sizeof (tree));
4449 for (i = 0, n = nelt / 2; i < n; i++)
4451 sel[i * 2] = i;
4452 sel[i * 2 + 1] = i + nelt;
4454 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4455 gcc_assert (perm_mask_high != NULL);
4457 for (i = 0; i < nelt; i++)
4458 sel[i] += nelt / 2;
4459 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4460 gcc_assert (perm_mask_low != NULL);
4462 for (i = 0, n = exact_log2 (length); i < n; i++)
4464 for (j = 0; j < length/2; j++)
4466 vect1 = dr_chain[j];
4467 vect2 = dr_chain[j+length/2];
4469 /* Create interleaving stmt:
4470 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4471 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4472 perm_stmt
4473 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4474 vect1, vect2, perm_mask_high);
4475 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4476 (*result_chain)[2*j] = high;
4478 /* Create interleaving stmt:
4479 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4480 nelt*3/2+1, ...}> */
4481 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4482 perm_stmt
4483 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4484 vect1, vect2, perm_mask_low);
4485 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4486 (*result_chain)[2*j+1] = low;
4488 memcpy (dr_chain.address (), result_chain->address (),
4489 length * sizeof (tree));
4493 /* Function vect_setup_realignment
4495 This function is called when vectorizing an unaligned load using
4496 the dr_explicit_realign[_optimized] scheme.
4497 This function generates the following code at the loop prolog:
4499 p = initial_addr;
4500 x msq_init = *(floor(p)); # prolog load
4501 realignment_token = call target_builtin;
4502 loop:
4503 x msq = phi (msq_init, ---)
4505 The stmts marked with x are generated only for the case of
4506 dr_explicit_realign_optimized.
4508 The code above sets up a new (vector) pointer, pointing to the first
4509 location accessed by STMT, and a "floor-aligned" load using that pointer.
4510 It also generates code to compute the "realignment-token" (if the relevant
4511 target hook was defined), and creates a phi-node at the loop-header bb
4512 whose arguments are the result of the prolog-load (created by this
4513 function) and the result of a load that takes place in the loop (to be
4514 created by the caller to this function).
4516 For the case of dr_explicit_realign_optimized:
4517 The caller to this function uses the phi-result (msq) to create the
4518 realignment code inside the loop, and sets up the missing phi argument,
4519 as follows:
4520 loop:
4521 msq = phi (msq_init, lsq)
4522 lsq = *(floor(p')); # load in loop
4523 result = realign_load (msq, lsq, realignment_token);
4525 For the case of dr_explicit_realign:
4526 loop:
4527 msq = *(floor(p)); # load in loop
4528 p' = p + (VS-1);
4529 lsq = *(floor(p')); # load in loop
4530 result = realign_load (msq, lsq, realignment_token);
4532 Input:
4533 STMT - (scalar) load stmt to be vectorized. This load accesses
4534 a memory location that may be unaligned.
4535 BSI - place where new code is to be inserted.
4536 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4537 is used.
4539 Output:
4540 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4541 target hook, if defined.
4542 Return value - the result of the loop-header phi node. */
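/* To illustrate the dr_explicit_realign scheme described above, assume
   16-byte V4SI vectors and an address p that is 8 bytes past a 16-byte
   boundary A: floor(p) is A, so msq holds the four ints at A..A+15;
   p + (VS-1) is A+23, so lsq holds the four ints at A+16..A+31; the
   REALIGN_LOAD with the token derived from p then extracts the four
   ints that start at A+8. */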
4544 tree
4545 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4546 tree *realignment_token,
4547 enum dr_alignment_support alignment_support_scheme,
4548 tree init_addr,
4549 struct loop **at_loop)
4551 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4552 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4553 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4554 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4555 struct loop *loop = NULL;
4556 edge pe = NULL;
4557 tree scalar_dest = gimple_assign_lhs (stmt);
4558 tree vec_dest;
4559 gimple inc;
4560 tree ptr;
4561 tree data_ref;
4562 gimple new_stmt;
4563 basic_block new_bb;
4564 tree msq_init = NULL_TREE;
4565 tree new_temp;
4566 gimple phi_stmt;
4567 tree msq = NULL_TREE;
4568 gimple_seq stmts = NULL;
4569 bool inv_p;
4570 bool compute_in_loop = false;
4571 bool nested_in_vect_loop = false;
4572 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4573 struct loop *loop_for_initial_load = NULL;
4575 if (loop_vinfo)
4577 loop = LOOP_VINFO_LOOP (loop_vinfo);
4578 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4581 gcc_assert (alignment_support_scheme == dr_explicit_realign
4582 || alignment_support_scheme == dr_explicit_realign_optimized);
4584 /* We need to generate three things:
4585 1. the misalignment computation
4586 2. the extra vector load (for the optimized realignment scheme).
4587 3. the phi node for the two vectors from which the realignment is
4588 done (for the optimized realignment scheme). */
4590 /* 1. Determine where to generate the misalignment computation.
4592 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4593 calculation will be generated by this function, outside the loop (in the
4594 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4595 caller, inside the loop.
4597 Background: If the misalignment remains fixed throughout the iterations of
4598 the loop, then both realignment schemes are applicable, and also the
4599 misalignment computation can be done outside LOOP. This is because we are
4600 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4601 are a multiple of VS (the Vector Size), and therefore the misalignment in
4602 different vectorized LOOP iterations is always the same.
4603 The problem arises only if the memory access is in an inner-loop nested
4604 inside LOOP, which is now being vectorized using outer-loop vectorization.
4605 This is the only case when the misalignment of the memory access may not
4606 remain fixed throughout the iterations of the inner-loop (as explained in
4607 detail in vect_supportable_dr_alignment). In this case, not only is the
4608 optimized realignment scheme not applicable, but also the misalignment
4609 computation (and generation of the realignment token that is passed to
4610 REALIGN_LOAD) have to be done inside the loop.
4612 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4613 or not, which in turn determines if the misalignment is computed inside
4614 the inner-loop, or outside LOOP. */
4616 if (init_addr != NULL_TREE || !loop_vinfo)
4618 compute_in_loop = true;
4619 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4623 /* 2. Determine where to generate the extra vector load.
4625 For the optimized realignment scheme, instead of generating two vector
4626 loads in each iteration, we generate a single extra vector load in the
4627 preheader of the loop, and in each iteration reuse the result of the
4628 vector load from the previous iteration. In case the memory access is in
4629 an inner-loop nested inside LOOP, which is now being vectorized using
4630 outer-loop vectorization, we need to determine whether this initial vector
4631 load should be generated at the preheader of the inner-loop, or can be
4632 generated at the preheader of LOOP. If the memory access has no evolution
4633 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4634 to be generated inside LOOP (in the preheader of the inner-loop). */
4636 if (nested_in_vect_loop)
4638 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4639 bool invariant_in_outerloop =
4640 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4641 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4643 else
4644 loop_for_initial_load = loop;
4645 if (at_loop)
4646 *at_loop = loop_for_initial_load;
4648 if (loop_for_initial_load)
4649 pe = loop_preheader_edge (loop_for_initial_load);
4651 /* 3. For the case of the optimized realignment, create the first vector
4652 load at the loop preheader. */
4654 if (alignment_support_scheme == dr_explicit_realign_optimized)
4656 /* Create msq_init = *(floor(p1)) in the loop preheader */
4658 gcc_assert (!compute_in_loop);
4659 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4660 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4661 NULL_TREE, &init_addr, NULL, &inc,
4662 true, &inv_p);
4663 new_temp = copy_ssa_name (ptr, NULL);
4664 new_stmt = gimple_build_assign_with_ops
4665 (BIT_AND_EXPR, new_temp, ptr,
4666 build_int_cst (TREE_TYPE (ptr),
4667 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4668 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4669 gcc_assert (!new_bb);
4670 data_ref
4671 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4672 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4673 new_stmt = gimple_build_assign (vec_dest, data_ref);
4674 new_temp = make_ssa_name (vec_dest, new_stmt);
4675 gimple_assign_set_lhs (new_stmt, new_temp);
4676 if (pe)
4678 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4679 gcc_assert (!new_bb);
4681 else
4682 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4684 msq_init = gimple_assign_lhs (new_stmt);
4687 /* 4. Create realignment token using a target builtin, if available.
4688 It is done either inside the containing loop, or before LOOP (as
4689 determined above). */
4691 if (targetm.vectorize.builtin_mask_for_load)
4693 tree builtin_decl;
4695 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4696 if (!init_addr)
4698 /* Generate the INIT_ADDR computation outside LOOP. */
4699 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4700 NULL_TREE, loop);
4701 if (loop)
4703 pe = loop_preheader_edge (loop);
4704 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4705 gcc_assert (!new_bb);
4707 else
4708 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4711 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4712 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4713 vec_dest =
4714 vect_create_destination_var (scalar_dest,
4715 gimple_call_return_type (new_stmt));
4716 new_temp = make_ssa_name (vec_dest, new_stmt);
4717 gimple_call_set_lhs (new_stmt, new_temp);
4719 if (compute_in_loop)
4720 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4721 else
4723 /* Generate the misalignment computation outside LOOP. */
4724 pe = loop_preheader_edge (loop);
4725 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4726 gcc_assert (!new_bb);
4729 *realignment_token = gimple_call_lhs (new_stmt);
4731 /* The result of the CALL_EXPR to this builtin is determined from
4732 the value of the parameter, and no global variables are touched,
4733 which makes the builtin a "const" function. Requiring the
4734 builtin to have the "const" attribute makes it unnecessary
4735 to call mark_call_clobbered. */
4736 gcc_assert (TREE_READONLY (builtin_decl));
4739 if (alignment_support_scheme == dr_explicit_realign)
4740 return msq;
4742 gcc_assert (!compute_in_loop);
4743 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4746 /* 5. Create msq = phi <msq_init, lsq> in loop */
4748 pe = loop_preheader_edge (containing_loop);
4749 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4750 msq = make_ssa_name (vec_dest, NULL);
4751 phi_stmt = create_phi_node (msq, containing_loop->header);
4752 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4754 return msq;
4758 /* Function vect_grouped_load_supported.
4760 Returns TRUE if even and odd permutations are supported,
4761 and FALSE otherwise. */
4763 bool
4764 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4766 enum machine_mode mode = TYPE_MODE (vectype);
4768 /* vect_permute_load_chain requires the group size to be a power of two. */
4769 if (exact_log2 (count) == -1)
4771 if (dump_enabled_p ())
4772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4773 "the size of the group of accesses"
4774 " is not a power of 2\n");
4775 return false;
4778 /* Check that the permutation is supported. */
4779 if (VECTOR_MODE_P (mode))
4781 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4782 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4784 for (i = 0; i < nelt; i++)
4785 sel[i] = i * 2;
4786 if (can_vec_perm_p (mode, false, sel))
4788 for (i = 0; i < nelt; i++)
4789 sel[i] = i * 2 + 1;
4790 if (can_vec_perm_p (mode, false, sel))
4791 return true;
4795 if (dump_enabled_p ())
4796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4797 "extract even/odd not supported by target\n");
4798 return false;
4801 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4802 type VECTYPE. */
4804 bool
4805 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4807 return vect_lanes_optab_supported_p ("vec_load_lanes",
4808 vec_load_lanes_optab,
4809 vectype, count);
4812 /* Function vect_permute_load_chain.
4814 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4815 a power of 2, generate extract_even/odd stmts to reorder the input data
4816 correctly. Return the final references for loads in RESULT_CHAIN.
4818 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4819 The input is 4 vectors each containing 8 elements. We assign a number to each
4820 element; the input sequence is:
4822 1st vec: 0 1 2 3 4 5 6 7
4823 2nd vec: 8 9 10 11 12 13 14 15
4824 3rd vec: 16 17 18 19 20 21 22 23
4825 4th vec: 24 25 26 27 28 29 30 31
4827 The output sequence should be:
4829 1st vec: 0 4 8 12 16 20 24 28
4830 2nd vec: 1 5 9 13 17 21 25 29
4831 3rd vec: 2 6 10 14 18 22 26 30
4832 4th vec: 3 7 11 15 19 23 27 31
4834 i.e., the first output vector should contain the first elements of each
4835 interleaving group, etc.
4837 We use extract_even/odd instructions to create such output. The input of
4838 each extract_even/odd operation is two vectors
4839 1st vec 2nd vec
4840 0 1 2 3 4 5 6 7
4842 and the output is the vector of extracted even/odd elements. The output of
4843 extract_even will be: 0 2 4 6
4844 and of extract_odd: 1 3 5 7
4847 The permutation is done in log LENGTH stages. In each stage extract_even
4848 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4849 their order. In our example,
4851 E1: extract_even (1st vec, 2nd vec)
4852 E2: extract_odd (1st vec, 2nd vec)
4853 E3: extract_even (3rd vec, 4th vec)
4854 E4: extract_odd (3rd vec, 4th vec)
4856 The output for the first stage will be:
4858 E1: 0 2 4 6 8 10 12 14
4859 E2: 1 3 5 7 9 11 13 15
4860 E3: 16 18 20 22 24 26 28 30
4861 E4: 17 19 21 23 25 27 29 31
4863 In order to proceed and create the correct sequence for the next stage (or
4864 for the correct output, if the second stage is the last one, as in our
4865 example), we first put the output of extract_even operation and then the
4866 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4867 The input for the second stage is:
4869 1st vec (E1): 0 2 4 6 8 10 12 14
4870 2nd vec (E3): 16 18 20 22 24 26 28 30
4871 3rd vec (E2): 1 3 5 7 9 11 13 15
4872 4th vec (E4): 17 19 21 23 25 27 29 31
4874 The output of the second stage:
4876 E1: 0 4 8 12 16 20 24 28
4877 E2: 2 6 10 14 18 22 26 30
4878 E3: 1 5 9 13 17 21 25 29
4879 E4: 3 7 11 15 19 23 27 31
4881 And RESULT_CHAIN after reordering:
4883 1st vec (E1): 0 4 8 12 16 20 24 28
4884 2nd vec (E3): 1 5 9 13 17 21 25 29
4885 3rd vec (E2): 2 6 10 14 18 22 26 30
4886 4th vec (E4): 3 7 11 15 19 23 27 31. */
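/* As a concrete example of the permutation masks built below, assume
   V8HI vectors (NELT == 8). The selectors then are

   perm_mask_even: { 0, 2, 4, 6, 8, 10, 12, 14 }
   perm_mask_odd: { 1, 3, 5, 7, 9, 11, 13, 15 }

   which pick the even-numbered and odd-numbered elements, respectively,
   from the concatenation of the two input vectors. */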
4888 static void
4889 vect_permute_load_chain (vec<tree> dr_chain,
4890 unsigned int length,
4891 gimple stmt,
4892 gimple_stmt_iterator *gsi,
4893 vec<tree> *result_chain)
4895 tree data_ref, first_vect, second_vect;
4896 tree perm_mask_even, perm_mask_odd;
4897 gimple perm_stmt;
4898 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4899 unsigned int i, j, log_length = exact_log2 (length);
4900 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4901 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4903 result_chain->quick_grow (length);
4904 memcpy (result_chain->address (), dr_chain.address (),
4905 length * sizeof (tree));
4907 for (i = 0; i < nelt; ++i)
4908 sel[i] = i * 2;
4909 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4910 gcc_assert (perm_mask_even != NULL);
4912 for (i = 0; i < nelt; ++i)
4913 sel[i] = i * 2 + 1;
4914 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4915 gcc_assert (perm_mask_odd != NULL);
4917 for (i = 0; i < log_length; i++)
4919 for (j = 0; j < length; j += 2)
4921 first_vect = dr_chain[j];
4922 second_vect = dr_chain[j+1];
4924 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4925 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4926 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4927 first_vect, second_vect,
4928 perm_mask_even);
4929 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4930 (*result_chain)[j/2] = data_ref;
4932 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4933 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4934 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4935 first_vect, second_vect,
4936 perm_mask_odd);
4937 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4938 (*result_chain)[j/2+length/2] = data_ref;
4940 memcpy (dr_chain.address (), result_chain->address (),
4941 length * sizeof (tree));
4946 /* Function vect_transform_grouped_load.
4948 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4949 to perform their permutation and ascribe the result vectorized statements to
4950 the scalar statements.
4953 void
4954 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4955 gimple_stmt_iterator *gsi)
4957 vec<tree> result_chain = vNULL;
4959 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4960 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
4961 permuted vectors, which are ready for vector computation. */
4962 result_chain.create (size);
4963 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4964 vect_record_grouped_load_vectors (stmt, result_chain);
4965 result_chain.release ();
4968 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4969 generated as part of the vectorization of STMT. Assign the statement
4970 for each vector to the associated scalar statement. */
4972 void
4973 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4975 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4976 gimple next_stmt, new_stmt;
4977 unsigned int i, gap_count;
4978 tree tmp_data_ref;
4980 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4981 Since we scan the chain starting from its first node, their order
4982 corresponds to the order of data-refs in RESULT_CHAIN. */
4983 next_stmt = first_stmt;
4984 gap_count = 1;
4985 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4987 if (!next_stmt)
4988 break;
4990 /* Skip the gaps. Loads created for the gaps will later be removed by
4991 dead code elimination. No need to check for the first stmt in
4992 the group, since it always exists.
4993 GROUP_GAP is the number of steps in elements from the previous
4994 access (if there is no gap GROUP_GAP is 1). We skip loads that
4995 correspond to the gaps. */
4996 if (next_stmt != first_stmt
4997 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4999 gap_count++;
5000 continue;
5003 while (next_stmt)
5005 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5006 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5007 copies, and we put the new vector statement in the first available
5008 RELATED_STMT. */
5009 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5010 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5011 else
5013 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5015 gimple prev_stmt =
5016 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5017 gimple rel_stmt =
5018 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5019 while (rel_stmt)
5021 prev_stmt = rel_stmt;
5022 rel_stmt =
5023 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5026 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5027 new_stmt;
5031 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5032 gap_count = 1;
5033 /* If NEXT_STMT accesses the same DR as the previous statement,
5034 put the same TMP_DATA_REF as its vectorized statement; otherwise
5035 get the next data-ref from RESULT_CHAIN. */
5036 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5037 break;
5042 /* Function vect_can_force_dr_alignment_p.
5044 Returns whether a DECL can be forced to be aligned on an ALIGNMENT-bit
5045 boundary.
5047 bool
5048 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5050 if (TREE_CODE (decl) != VAR_DECL)
5051 return false;
5053 /* We cannot change alignment of common or external symbols as another
5054 translation unit may contain a definition with lower alignment.
5055 The rules of common symbol linking mean that the definition
5056 will override the common symbol. The same is true for constant
5057 pool entries which may be shared and are not properly merged
5058 by LTO. */
5059 if (DECL_EXTERNAL (decl)
5060 || DECL_COMMON (decl)
5061 || DECL_IN_CONSTANT_POOL (decl))
5062 return false;
5064 if (TREE_ASM_WRITTEN (decl))
5065 return false;
5067 /* Do not override the alignment as specified by the ABI when the used
5068 attribute is set. */
5069 if (DECL_PRESERVE_P (decl))
5070 return false;
5072 /* Do not override explicit alignment set by the user when an explicit
5073 section name is also used. This is a common idiom used by many
5074 software projects. */
5075 if (DECL_SECTION_NAME (decl) != NULL_TREE
5076 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
5077 return false;
5079 if (TREE_STATIC (decl))
5080 return (alignment <= MAX_OFILE_ALIGNMENT);
5081 else
5082 return (alignment <= MAX_STACK_ALIGNMENT);
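/* For example, a file-local static array can usually have its alignment
   raised here (up to MAX_OFILE_ALIGNMENT), whereas an external or common
   symbol is rejected above because another translation unit may provide
   a definition with smaller alignment. */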
5086 /* Return whether the data reference DR is supported with respect to its
5087 alignment.
5088 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5089 if it is aligned, i.e., check if it is possible to vectorize it with different
5090 alignment. */
5092 enum dr_alignment_support
5093 vect_supportable_dr_alignment (struct data_reference *dr,
5094 bool check_aligned_accesses)
5096 gimple stmt = DR_STMT (dr);
5097 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5098 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5099 enum machine_mode mode = TYPE_MODE (vectype);
5100 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5101 struct loop *vect_loop = NULL;
5102 bool nested_in_vect_loop = false;
5104 if (aligned_access_p (dr) && !check_aligned_accesses)
5105 return dr_aligned;
5107 /* For now assume all conditional loads/stores support unaligned
5108 access without any special code. */
5109 if (is_gimple_call (stmt)
5110 && gimple_call_internal_p (stmt)
5111 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5112 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5113 return dr_unaligned_supported;
5115 if (loop_vinfo)
5117 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5118 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5121 /* Possibly unaligned access. */
5123 /* We can choose between using the implicit realignment scheme (generating
5124 a misaligned_move stmt) and the explicit realignment scheme (generating
5125 aligned loads with a REALIGN_LOAD). There are two variants to the
5126 explicit realignment scheme: optimized, and unoptimized.
5127 We can optimize the realignment only if the step between consecutive
5128 vector loads is equal to the vector size. Since the vector memory
5129 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5130 is guaranteed that the misalignment amount remains the same throughout the
5131 execution of the vectorized loop. Therefore, we can create the
5132 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5133 at the loop preheader.
5135 However, in the case of outer-loop vectorization, when vectorizing a
5136 memory access in the inner-loop nested within the LOOP that is now being
5137 vectorized, while it is guaranteed that the misalignment of the
5138 vectorized memory access will remain the same in different outer-loop
5139 iterations, it is *not* guaranteed that it will remain the same throughout
5140 the execution of the inner-loop. This is because the inner-loop advances
5141 with the original scalar step (and not in steps of VS). If the inner-loop
5142 step happens to be a multiple of VS, then the misalignment remains fixed
5143 and we can use the optimized realignment scheme. For example:
5145 for (i=0; i<N; i++)
5146 for (j=0; j<M; j++)
5147 s += a[i+j];
5149 When vectorizing the i-loop in the above example, the step between
5150 consecutive vector loads is 1, and so the misalignment does not remain
5151 fixed across the execution of the inner-loop, and the realignment cannot
5152 be optimized (as illustrated in the following pseudo vectorized loop):
5154 for (i=0; i<N; i+=4)
5155 for (j=0; j<M; j++){
5156 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5157 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5158 // (assuming that we start from an aligned address).
5161 We therefore have to use the unoptimized realignment scheme:
5163 for (i=0; i<N; i+=4)
5164 for (j=k; j<M; j+=4)
5165 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5166 // that the misalignment of the initial address is
5167 // 0).
5169 The loop can then be vectorized as follows:
5171 for (k=0; k<4; k++){
5172 rt = get_realignment_token (&vp[k]);
5173 for (i=0; i<N; i+=4){
5174 v1 = vp[i+k];
5175 for (j=k; j<M; j+=4){
5176 v2 = vp[i+j+VS-1];
5177 va = REALIGN_LOAD <v1,v2,rt>;
5178 vs += va;
5179 v1 = v2;
5182 } */
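/* In summary, for a read the code below prefers the explicit realignment
   schemes whenever the target provides vec_realign_load (and a usable
   builtin_mask_for_load, if that hook is defined): the optimized variant
   when the misalignment stays fixed across the iterations of the
   vectorized loop, the unoptimized one otherwise. Failing that, and for
   any write, it falls back to asking the target whether a plain
   misaligned access is supported. */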
5184 if (DR_IS_READ (dr))
5186 bool is_packed = false;
5187 tree type = (TREE_TYPE (DR_REF (dr)));
5189 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5190 && (!targetm.vectorize.builtin_mask_for_load
5191 || targetm.vectorize.builtin_mask_for_load ()))
5193 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5194 if ((nested_in_vect_loop
5195 && (TREE_INT_CST_LOW (DR_STEP (dr))
5196 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5197 || !loop_vinfo)
5198 return dr_explicit_realign;
5199 else
5200 return dr_explicit_realign_optimized;
5202 if (!known_alignment_for_access_p (dr))
5203 is_packed = not_size_aligned (DR_REF (dr));
5205 if ((TYPE_USER_ALIGN (type) && !is_packed)
5206 || targetm.vectorize.
5207 support_vector_misalignment (mode, type,
5208 DR_MISALIGNMENT (dr), is_packed))
5209 /* Can't software pipeline the loads, but can at least do them. */
5210 return dr_unaligned_supported;
5212 else
5214 bool is_packed = false;
5215 tree type = (TREE_TYPE (DR_REF (dr)));
5217 if (!known_alignment_for_access_p (dr))
5218 is_packed = not_size_aligned (DR_REF (dr));
5220 if ((TYPE_USER_ALIGN (type) && !is_packed)
5221 || targetm.vectorize.
5222 support_vector_misalignment (mode, type,
5223 DR_MISALIGNMENT (dr), is_packed))
5224 return dr_unaligned_supported;
5227 /* Unsupported. */
5228 return dr_unaligned_unsupported;