[official-gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "gimple.h"
34 #include "gimplify.h"
35 #include "gimple-iterator.h"
36 #include "gimplify-me.h"
37 #include "gimple-ssa.h"
38 #include "tree-phinodes.h"
39 #include "ssa-iterators.h"
40 #include "tree-ssanames.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-chrec.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "diagnostic-core.h"
50 /* Need to include rtl.h, expr.h, etc. for optabs. */
51 #include "expr.h"
52 #include "optabs.h"
54 /* Return true if load- or store-lanes optab OPTAB is implemented for
55 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
57 static bool
58 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
59 tree vectype, unsigned HOST_WIDE_INT count)
61 enum machine_mode mode, array_mode;
62 bool limit_p;
64 mode = TYPE_MODE (vectype);
65 limit_p = !targetm.array_mode_supported_p (mode, count);
66 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
67 MODE_INT, limit_p);
69 if (array_mode == BLKmode)
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
74 GET_MODE_NAME (mode), count);
75 return false;
78 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
80 if (dump_enabled_p ())
81 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
82 "cannot use %s<%s><%s>\n", name,
83 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
84 return false;
87 if (dump_enabled_p ())
88 dump_printf_loc (MSG_NOTE, vect_location,
89 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
90 GET_MODE_NAME (mode));
92 return true;
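/* Worked example (assuming a 128-bit vector target): for COUNT == 3 and
   VECTYPE V4SI (16 bytes), the code above first asks for a 48-byte integer
   array mode and then for a vec_load_lanes / vec_store_lanes handler that
   converts between that array mode and V4SImode.  Targets without such
   support return BLKmode or CODE_FOR_nothing here, and the grouped access
   then typically falls back to separate vector loads or stores plus
   permutes (on ARM NEON the handler maps to instructions like vld3).  */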
96 /* Return the smallest scalar part of STMT.
97 This is used to determine the vectype of the stmt. We generally set the
98 vectype according to the type of the result (lhs). For stmts whose
99 result-type is different than the type of the arguments (e.g., demotion,
100 promotion), vectype will be reset appropriately (later). Note that we have
101 to visit the smallest datatype in this function, because that determines the
102 VF. If the smallest datatype in the loop is present only as the rhs of a
 103            promotion operation, we'd miss it.
104 Such a case, where a variable of this datatype does not appear in the lhs
105 anywhere in the loop, can only occur if it's an invariant: e.g.:
106 'int_x = (int) short_inv', which we'd expect to have been optimized away by
107 invariant motion. However, we cannot rely on invariant motion to always
108 take invariants out of the loop, and so in the case of promotion we also
109 have to check the rhs.
110 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
111 types. */
113 tree
114 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
115 HOST_WIDE_INT *rhs_size_unit)
117 tree scalar_type = gimple_expr_type (stmt);
118 HOST_WIDE_INT lhs, rhs;
120 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
122 if (is_gimple_assign (stmt)
123 && (gimple_assign_cast_p (stmt)
124 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
125 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
126 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
128 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
130 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
131 if (rhs < lhs)
132 scalar_type = rhs_type;
135 *lhs_size_unit = lhs;
136 *rhs_size_unit = rhs;
137 return scalar_type;
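/* Worked example: for a statement recognized as a widening multiplication,

     int_res = (int) short_a * (int) short_b;

   the lhs type is 'int' but the operands are 'short', so this function
   returns 'short' with *LHS_SIZE_UNIT == 4 and *RHS_SIZE_UNIT == 2; the
   narrower type is then the one that determines the vectorization
   factor.  */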
141 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
142 tested at run-time. Return TRUE if DDR was successfully inserted.
143 Return false if versioning is not supported. */
145 static bool
146 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
148 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
150 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
151 return false;
153 if (dump_enabled_p ())
155 dump_printf_loc (MSG_NOTE, vect_location,
156 "mark for run-time aliasing test between ");
157 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
158 dump_printf (MSG_NOTE, " and ");
159 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
160 dump_printf (MSG_NOTE, "\n");
163 if (optimize_loop_nest_for_size_p (loop))
165 if (dump_enabled_p ())
166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
167 "versioning not supported when optimizing"
168 " for size.\n");
169 return false;
172 /* FORNOW: We don't support versioning with outer-loop vectorization. */
173 if (loop->inner)
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
177 "versioning not yet supported for outer-loops.\n");
178 return false;
181 /* FORNOW: We don't support creating runtime alias tests for non-constant
182 step. */
183 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
184 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
186 if (dump_enabled_p ())
187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
188 "versioning not yet supported for non-constant "
189 "step\n");
190 return false;
193 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
194 return true;
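/* Worked example (sketch): for a loop like

     void f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   where 'a' and 'b' may alias, the DDR is pushed onto
   LOOP_VINFO_MAY_ALIAS_DDRS and the loop is later versioned under a
   run-time check of roughly the form 'a + n <= b || b + n <= a', so the
   vector loop only runs when the two accessed ranges do not overlap.  */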
198 /* Function vect_analyze_data_ref_dependence.
200 Return TRUE if there (might) exist a dependence between a memory-reference
 201    DRA and a memory-reference DRB.  If versioning for alias can instead check
 202    the dependence at run time, return FALSE.  Adjust *MAX_VF according to
 203    the data dependence.  */
205 static bool
206 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
207 loop_vec_info loop_vinfo, int *max_vf)
209 unsigned int i;
210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
211 struct data_reference *dra = DDR_A (ddr);
212 struct data_reference *drb = DDR_B (ddr);
213 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
214 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
215 lambda_vector dist_v;
216 unsigned int loop_depth;
218 /* In loop analysis all data references should be vectorizable. */
219 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
220 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
221 gcc_unreachable ();
223 /* Independent data accesses. */
224 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
225 return false;
227 if (dra == drb
228 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
229 return false;
231 /* Unknown data dependence. */
232 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
234 /* If user asserted safelen consecutive iterations can be
235 executed concurrently, assume independence. */
236 if (loop->safelen >= 2)
238 if (loop->safelen < *max_vf)
239 *max_vf = loop->safelen;
240 return false;
243 if (STMT_VINFO_GATHER_P (stmtinfo_a)
244 || STMT_VINFO_GATHER_P (stmtinfo_b))
246 if (dump_enabled_p ())
248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
249 "versioning for alias not supported for: "
250 "can't determine dependence between ");
251 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
252 DR_REF (dra));
253 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
254 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
255 DR_REF (drb));
256 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
258 return true;
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
264 "versioning for alias required: "
265 "can't determine dependence between ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (dra));
268 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
269 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
270 DR_REF (drb));
271 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
274 /* Add to list of ddrs that need to be tested at run-time. */
275 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
278 /* Known data dependence. */
279 if (DDR_NUM_DIST_VECTS (ddr) == 0)
281 /* If user asserted safelen consecutive iterations can be
282 executed concurrently, assume independence. */
283 if (loop->safelen >= 2)
285 if (loop->safelen < *max_vf)
286 *max_vf = loop->safelen;
287 return false;
290 if (STMT_VINFO_GATHER_P (stmtinfo_a)
291 || STMT_VINFO_GATHER_P (stmtinfo_b))
293 if (dump_enabled_p ())
295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
296 "versioning for alias not supported for: "
297 "bad dist vector for ");
298 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
299 DR_REF (dra));
300 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
301 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
302 DR_REF (drb));
303 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
305 return true;
308 if (dump_enabled_p ())
310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
311 "versioning for alias required: "
312 "bad dist vector for ");
313 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
314 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
315 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
318 /* Add to list of ddrs that need to be tested at run-time. */
319 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
322 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
323 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
325 int dist = dist_v[loop_depth];
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location,
329 "dependence distance = %d.\n", dist);
331 if (dist == 0)
333 if (dump_enabled_p ())
335 dump_printf_loc (MSG_NOTE, vect_location,
336 "dependence distance == 0 between ");
337 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
338 dump_printf (MSG_NOTE, " and ");
339 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 340                   dump_printf (MSG_NOTE, "\n");
343 /* When we perform grouped accesses and perform implicit CSE
344 by detecting equal accesses and doing disambiguation with
345 runtime alias tests like for
346 .. = a[i];
347 .. = a[i+1];
348 a[i] = ..;
349 a[i+1] = ..;
350 *p = ..;
351 .. = a[i];
352 .. = a[i+1];
353 where we will end up loading { a[i], a[i+1] } once, make
354 sure that inserting group loads before the first load and
355 stores after the last store will do the right thing. */
356 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
357 && GROUP_SAME_DR_STMT (stmtinfo_a))
358 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
359 && GROUP_SAME_DR_STMT (stmtinfo_b)))
361 gimple earlier_stmt;
362 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
363 if (DR_IS_WRITE
364 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
366 if (dump_enabled_p ())
367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
368 "READ_WRITE dependence in interleaving."
369 "\n");
370 return true;
374 continue;
377 if (dist > 0 && DDR_REVERSED_P (ddr))
379 /* If DDR_REVERSED_P the order of the data-refs in DDR was
380 reversed (to make distance vector positive), and the actual
381 distance is negative. */
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "dependence distance negative.\n");
385 continue;
388 if (abs (dist) >= 2
389 && abs (dist) < *max_vf)
391 /* The dependence distance requires reduction of the maximal
392 vectorization factor. */
393 *max_vf = abs (dist);
394 if (dump_enabled_p ())
395 dump_printf_loc (MSG_NOTE, vect_location,
396 "adjusting maximal vectorization factor to %i\n",
397 *max_vf);
400 if (abs (dist) >= *max_vf)
402 /* Dependence distance does not create dependence, as far as
403 vectorization is concerned, in this case. */
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_NOTE, vect_location,
406 "dependence distance >= VF.\n");
407 continue;
410 if (dump_enabled_p ())
412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
413 "not vectorized, possible dependence "
414 "between data-refs ");
 415       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 416       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 417       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 418       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
421 return true;
424 return false;
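/* Worked example: for

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   the dependence distance is 3, so *MAX_VF is reduced to 3.  A vector of
   up to 3 elements per iteration still loads a[i] before a later group
   stores to it, whereas a factor of 4 would load a[i + 3] in the same
   vector iteration that is supposed to have stored it first.  */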
427 /* Function vect_analyze_data_ref_dependences.
429 Examine all the data references in the loop, and make sure there do not
430 exist any data dependences between them. Set *MAX_VF according to
431 the maximum vectorization factor the data dependences allow. */
433 bool
434 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
436 unsigned int i;
437 struct data_dependence_relation *ddr;
439 if (dump_enabled_p ())
440 dump_printf_loc (MSG_NOTE, vect_location,
441 "=== vect_analyze_data_ref_dependences ===\n");
443 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
444 &LOOP_VINFO_DDRS (loop_vinfo),
445 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
446 return false;
448 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
449 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
450 return false;
452 return true;
456 /* Function vect_slp_analyze_data_ref_dependence.
 458    Return TRUE if there (might) exist a dependence between a memory-reference
 459    DRA and a memory-reference DRB that prevents basic-block vectorization.
 460    Return FALSE if the accesses are independent or the dependence is
 461    harmless for SLP.  */
463 static bool
464 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
466 struct data_reference *dra = DDR_A (ddr);
467 struct data_reference *drb = DDR_B (ddr);
469 /* We need to check dependences of statements marked as unvectorizable
470 as well, they still can prohibit vectorization. */
472 /* Independent data accesses. */
473 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
474 return false;
476 if (dra == drb)
477 return false;
479 /* Read-read is OK. */
480 if (DR_IS_READ (dra) && DR_IS_READ (drb))
481 return false;
483 /* If dra and drb are part of the same interleaving chain consider
484 them independent. */
485 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
486 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
487 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
488 return false;
490 /* Unknown data dependence. */
491 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
493 gimple earlier_stmt;
495 if (dump_enabled_p ())
497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
498 "can't determine dependence between ");
499 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
500 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
501 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
502 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
505 /* We do not vectorize basic blocks with write-write dependencies. */
506 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
507 return true;
509 /* Check that it's not a load-after-store dependence. */
510 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
511 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
512 return true;
514 return false;
517 if (dump_enabled_p ())
519 dump_printf_loc (MSG_NOTE, vect_location,
520 "determined dependence between ");
521 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
522 dump_printf (MSG_NOTE, " and ");
523 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
524 dump_printf (MSG_NOTE, "\n");
527 /* Do not vectorize basic blocks with write-write dependences. */
528 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
529 return true;
531 /* Check dependence between DRA and DRB for basic block vectorization.
532 If the accesses share same bases and offsets, we can compare their initial
533 constant offsets to decide whether they differ or not. In case of a read-
534 write dependence we check that the load is before the store to ensure that
535 vectorization will not change the order of the accesses. */
537 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
538 gimple earlier_stmt;
540 /* Check that the data-refs have same bases and offsets. If not, we can't
541 determine if they are dependent. */
542 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
543 || !dr_equal_offsets_p (dra, drb))
544 return true;
546 /* Check the types. */
547 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
548 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
550 if (type_size_a != type_size_b
551 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
552 TREE_TYPE (DR_REF (drb))))
553 return true;
555 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
556 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
558 /* Two different locations - no dependence. */
559 if (init_a != init_b)
560 return false;
562 /* We have a read-write dependence. Check that the load is before the store.
563 When we vectorize basic blocks, vector load can be only before
564 corresponding scalar load, and vector store can be only after its
 565      corresponding scalar store.  So the order of the accesses is preserved in
566 case the load is before the store. */
567 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
568 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
569 return false;
571 return true;
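/* Worked example: in a straight-line block such as

     x = a[0];
     a[0] = y;

   the load is the earlier statement, so the check above returns false and
   SLP vectorization may proceed; with the two statements in the opposite
   order (store first, then a load of the same location) it returns true,
   because the vector load could end up scheduled before the store and
   observe the wrong value.  */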
 575 /* Function vect_slp_analyze_data_ref_dependences.
 577    Examine all the data references in the basic-block, and make sure there
 578    do not exist any data dependences between them that would prevent
 579    vectorization.  */
581 bool
582 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
584 struct data_dependence_relation *ddr;
585 unsigned int i;
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_NOTE, vect_location,
589 "=== vect_slp_analyze_data_ref_dependences ===\n");
591 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
592 &BB_VINFO_DDRS (bb_vinfo),
593 vNULL, true))
594 return false;
596 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
597 if (vect_slp_analyze_data_ref_dependence (ddr))
598 return false;
600 return true;
604 /* Function vect_compute_data_ref_alignment
606 Compute the misalignment of the data reference DR.
608 Output:
609 1. If during the misalignment computation it is found that the data reference
610 cannot be vectorized then false is returned.
611 2. DR_MISALIGNMENT (DR) is defined.
613 FOR NOW: No analysis is actually performed. Misalignment is calculated
614 only for trivial cases. TODO. */
616 static bool
617 vect_compute_data_ref_alignment (struct data_reference *dr)
619 gimple stmt = DR_STMT (dr);
620 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
621 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
622 struct loop *loop = NULL;
623 tree ref = DR_REF (dr);
624 tree vectype;
625 tree base, base_addr;
626 bool base_aligned;
627 tree misalign;
628 tree aligned_to, alignment;
630 if (dump_enabled_p ())
631 dump_printf_loc (MSG_NOTE, vect_location,
632 "vect_compute_data_ref_alignment:\n");
634 if (loop_vinfo)
635 loop = LOOP_VINFO_LOOP (loop_vinfo);
637 /* Initialize misalignment to unknown. */
638 SET_DR_MISALIGNMENT (dr, -1);
640 /* Strided loads perform only component accesses, misalignment information
641 is irrelevant for them. */
642 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
643 return true;
645 misalign = DR_INIT (dr);
646 aligned_to = DR_ALIGNED_TO (dr);
647 base_addr = DR_BASE_ADDRESS (dr);
648 vectype = STMT_VINFO_VECTYPE (stmt_info);
650 /* In case the dataref is in an inner-loop of the loop that is being
651 vectorized (LOOP), we use the base and misalignment information
652 relative to the outer-loop (LOOP). This is ok only if the misalignment
653 stays the same throughout the execution of the inner-loop, which is why
 654      we have to check that the stride of the dataref in the inner-loop is a
 655      multiple of the vector size.  */
656 if (loop && nested_in_vect_loop_p (loop, stmt))
658 tree step = DR_STEP (dr);
659 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
661 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
663 if (dump_enabled_p ())
664 dump_printf_loc (MSG_NOTE, vect_location,
665 "inner step divides the vector-size.\n");
666 misalign = STMT_VINFO_DR_INIT (stmt_info);
667 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
668 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
670 else
672 if (dump_enabled_p ())
673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
674 "inner step doesn't divide the vector-size.\n");
675 misalign = NULL_TREE;
679 /* Similarly, if we're doing basic-block vectorization, we can only use
680 base and misalignment information relative to an innermost loop if the
681 misalignment stays the same throughout the execution of the loop.
 682      As above, this is the case if the stride of the dataref is a multiple
 683      of the vector size.  */
684 if (!loop)
686 tree step = DR_STEP (dr);
687 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
689 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
691 if (dump_enabled_p ())
692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
693 "SLP: step doesn't divide the vector-size.\n");
694 misalign = NULL_TREE;
698 base = build_fold_indirect_ref (base_addr);
699 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
701 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
702 || !misalign)
704 if (dump_enabled_p ())
706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
707 "Unknown alignment for access: ");
708 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
709 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
711 return true;
714 if ((DECL_P (base)
715 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
716 alignment) >= 0)
717 || (TREE_CODE (base_addr) == SSA_NAME
718 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
719 TREE_TYPE (base_addr)))),
720 alignment) >= 0)
721 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
722 base_aligned = true;
723 else
724 base_aligned = false;
726 if (!base_aligned)
728 /* Do not change the alignment of global variables here if
729 flag_section_anchors is enabled as we already generated
730 RTL for other functions. Most global variables should
731 have been aligned during the IPA increase_alignment pass. */
732 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
733 || (TREE_STATIC (base) && flag_section_anchors))
735 if (dump_enabled_p ())
737 dump_printf_loc (MSG_NOTE, vect_location,
738 "can't force alignment of ref: ");
739 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
740 dump_printf (MSG_NOTE, "\n");
742 return true;
745 /* Force the alignment of the decl.
746 NOTE: This is the only change to the code we make during
747 the analysis phase, before deciding to vectorize the loop. */
748 if (dump_enabled_p ())
750 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
751 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
752 dump_printf (MSG_NOTE, "\n");
755 ((dataref_aux *)dr->aux)->base_decl = base;
756 ((dataref_aux *)dr->aux)->base_misaligned = true;
759 /* If this is a backward running DR then first access in the larger
760 vectype actually is N-1 elements before the address in the DR.
761 Adjust misalign accordingly. */
762 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
764 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
765 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
766 otherwise we wouldn't be here. */
767 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
768 /* PLUS because DR_STEP was negative. */
769 misalign = size_binop (PLUS_EXPR, misalign, offset);
772 /* Modulo alignment. */
773 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
775 if (!tree_fits_uhwi_p (misalign))
777 /* Negative or overflowed misalignment value. */
778 if (dump_enabled_p ())
779 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
780 "unexpected misalign value\n");
781 return false;
784 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
786 if (dump_enabled_p ())
788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
789 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
790 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
791 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
794 return true;
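/* Worked example (assuming V4SI vectors, 16-byte alignment): for an access
   a[i + 1] into a 16-byte aligned 'int a[N]', the base is aligned,
   DR_INIT is 4 and the recorded misalignment is 4 % 16 = 4 bytes; for
   a[i] it would be 0, i.e. the access is known to be aligned.  */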
798 /* Function vect_compute_data_refs_alignment
800 Compute the misalignment of data references in the loop.
801 Return FALSE if a data reference is found that cannot be vectorized. */
803 static bool
804 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
805 bb_vec_info bb_vinfo)
807 vec<data_reference_p> datarefs;
808 struct data_reference *dr;
809 unsigned int i;
811 if (loop_vinfo)
812 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
813 else
814 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
816 FOR_EACH_VEC_ELT (datarefs, i, dr)
817 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
818 && !vect_compute_data_ref_alignment (dr))
820 if (bb_vinfo)
822 /* Mark unsupported statement as unvectorizable. */
823 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
824 continue;
826 else
827 return false;
830 return true;
834 /* Function vect_update_misalignment_for_peel
836 DR - the data reference whose misalignment is to be adjusted.
837 DR_PEEL - the data reference whose misalignment is being made
838 zero in the vector loop by the peel.
839 NPEEL - the number of iterations in the peel loop if the misalignment
840 of DR_PEEL is known at compile time. */
842 static void
843 vect_update_misalignment_for_peel (struct data_reference *dr,
844 struct data_reference *dr_peel, int npeel)
846 unsigned int i;
847 vec<dr_p> same_align_drs;
848 struct data_reference *current_dr;
849 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
850 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
851 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
852 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
854 /* For interleaved data accesses the step in the loop must be multiplied by
855 the size of the interleaving group. */
856 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
857 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
858 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
859 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
861 /* It can be assumed that the data refs with the same alignment as dr_peel
862 are aligned in the vector loop. */
863 same_align_drs
864 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
865 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
867 if (current_dr != dr)
868 continue;
869 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
870 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
871 SET_DR_MISALIGNMENT (dr, 0);
872 return;
875 if (known_alignment_for_access_p (dr)
876 && known_alignment_for_access_p (dr_peel))
878 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
879 int misal = DR_MISALIGNMENT (dr);
880 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
881 misal += negative ? -npeel * dr_size : npeel * dr_size;
882 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
883 SET_DR_MISALIGNMENT (dr, misal);
884 return;
887 if (dump_enabled_p ())
888 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
889 SET_DR_MISALIGNMENT (dr, -1);
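/* Worked example (V4SI vectors, element size 4): if DR_PEEL has a known
   misalignment of 8 bytes, NPEEL is 2 and DR_PEEL becomes aligned; a DR
   with the same element size and misalignment 4 is then updated to
   (4 + 2 * 4) & 15, i.e. 12 bytes, by the code above.  */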
893 /* Function vect_verify_datarefs_alignment
895 Return TRUE if all data references in the loop can be
896 handled with respect to alignment. */
898 bool
899 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
901 vec<data_reference_p> datarefs;
902 struct data_reference *dr;
903 enum dr_alignment_support supportable_dr_alignment;
904 unsigned int i;
906 if (loop_vinfo)
907 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
908 else
909 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
911 FOR_EACH_VEC_ELT (datarefs, i, dr)
913 gimple stmt = DR_STMT (dr);
914 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
916 if (!STMT_VINFO_RELEVANT_P (stmt_info))
917 continue;
919 /* For interleaving, only the alignment of the first access matters.
920 Skip statements marked as not vectorizable. */
921 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
922 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
923 || !STMT_VINFO_VECTORIZABLE (stmt_info))
924 continue;
926 /* Strided loads perform only component accesses, alignment is
927 irrelevant for them. */
928 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
929 continue;
931 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
932 if (!supportable_dr_alignment)
934 if (dump_enabled_p ())
936 if (DR_IS_READ (dr))
937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
938 "not vectorized: unsupported unaligned load.");
939 else
940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
941 "not vectorized: unsupported unaligned "
942 "store.");
944 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
945 DR_REF (dr));
946 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
948 return false;
950 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location,
952 "Vectorizing an unaligned access.\n");
954 return true;
 957 /* Given a memory reference EXP return whether its alignment is less
958 than its size. */
960 static bool
961 not_size_aligned (tree exp)
963 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
964 return true;
966 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
967 > get_object_alignment (exp));
970 /* Function vector_alignment_reachable_p
972 Return true if vector alignment for DR is reachable by peeling
973 a few loop iterations. Return false otherwise. */
975 static bool
976 vector_alignment_reachable_p (struct data_reference *dr)
978 gimple stmt = DR_STMT (dr);
979 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
980 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
982 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 984       /* For interleaved accesses we peel only if the number of iterations in
 985          the prolog loop (VF - misalignment) is a multiple of the
 986          number of interleaved accesses.
987 int elem_size, mis_in_elements;
988 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
990 /* FORNOW: handle only known alignment. */
991 if (!known_alignment_for_access_p (dr))
992 return false;
994 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
995 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
997 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
998 return false;
1001 /* If misalignment is known at the compile time then allow peeling
1002 only if natural alignment is reachable through peeling. */
1003 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1005 HOST_WIDE_INT elmsize =
1006 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1007 if (dump_enabled_p ())
1009 dump_printf_loc (MSG_NOTE, vect_location,
1010 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1011 dump_printf (MSG_NOTE,
1012 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1014 if (DR_MISALIGNMENT (dr) % elmsize)
1016 if (dump_enabled_p ())
1017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1018 "data size does not divide the misalignment.\n");
1019 return false;
1023 if (!known_alignment_for_access_p (dr))
1025 tree type = TREE_TYPE (DR_REF (dr));
1026 bool is_packed = not_size_aligned (DR_REF (dr));
1027 if (dump_enabled_p ())
1028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1029 "Unknown misalignment, is_packed = %d\n",is_packed);
1030 if ((TYPE_USER_ALIGN (type) && !is_packed)
1031 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1032 return true;
1033 else
1034 return false;
1037 return true;
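/* Worked example (16-byte vectors): an int access with a known
   misalignment of 2 bytes can never become aligned by peeling whole
   iterations, since each iteration advances the address by a multiple of
   4; the "data size does not divide the misalignment" check above rejects
   it.  A misalignment of 8 bytes, by contrast, is cured by peeling two
   iterations.  */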
1041 /* Calculate the cost of the memory access represented by DR. */
1043 static void
1044 vect_get_data_access_cost (struct data_reference *dr,
1045 unsigned int *inside_cost,
1046 unsigned int *outside_cost,
1047 stmt_vector_for_cost *body_cost_vec)
1049 gimple stmt = DR_STMT (dr);
1050 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1051 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1052 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1053 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1054 int ncopies = vf / nunits;
1056 if (DR_IS_READ (dr))
1057 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1058 NULL, body_cost_vec, false);
1059 else
1060 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1062 if (dump_enabled_p ())
1063 dump_printf_loc (MSG_NOTE, vect_location,
1064 "vect_get_data_access_cost: inside_cost = %d, "
1065 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1069 /* Insert DR into peeling hash table with NPEEL as key. */
1071 static void
1072 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1073 int npeel)
1075 struct _vect_peel_info elem, *slot;
1076 _vect_peel_info **new_slot;
1077 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1079 elem.npeel = npeel;
1080 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1081 if (slot)
1082 slot->count++;
1083 else
1085 slot = XNEW (struct _vect_peel_info);
1086 slot->npeel = npeel;
1087 slot->dr = dr;
1088 slot->count = 1;
1089 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1090 *new_slot = slot;
1093 if (!supportable_dr_alignment && unlimited_cost_model ())
1094 slot->count += VECT_MAX_COST;
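/* Worked example: if three data-refs all become aligned by peeling the same
   number of iterations, the entry for that NPEEL ends up with count == 3;
   an access whose misalignment the target cannot handle additionally gets
   VECT_MAX_COST added, so that without a cost model the peeling amount
   that aligns it wins the "most frequent" traversal below.  */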
1098 /* Traverse peeling hash table to find peeling option that aligns maximum
1099 number of data accesses. */
1102 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1103 _vect_peel_extended_info *max)
1105 vect_peel_info elem = *slot;
1107 if (elem->count > max->peel_info.count
1108 || (elem->count == max->peel_info.count
1109 && max->peel_info.npeel > elem->npeel))
1111 max->peel_info.npeel = elem->npeel;
1112 max->peel_info.count = elem->count;
1113 max->peel_info.dr = elem->dr;
1116 return 1;
1120 /* Traverse peeling hash table and calculate cost for each peeling option.
1121 Find the one with the lowest cost. */
1124 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1125 _vect_peel_extended_info *min)
1127 vect_peel_info elem = *slot;
1128 int save_misalignment, dummy;
1129 unsigned int inside_cost = 0, outside_cost = 0, i;
1130 gimple stmt = DR_STMT (elem->dr);
1131 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1132 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1133 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1134 struct data_reference *dr;
1135 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1136 int single_iter_cost;
1138 prologue_cost_vec.create (2);
1139 body_cost_vec.create (2);
1140 epilogue_cost_vec.create (2);
1142 FOR_EACH_VEC_ELT (datarefs, i, dr)
1144 stmt = DR_STMT (dr);
1145 stmt_info = vinfo_for_stmt (stmt);
1146 /* For interleaving, only the alignment of the first access
1147 matters. */
1148 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1149 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1150 continue;
1152 save_misalignment = DR_MISALIGNMENT (dr);
1153 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1154 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1155 &body_cost_vec);
1156 SET_DR_MISALIGNMENT (dr, save_misalignment);
1159 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1160 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1161 &dummy, single_iter_cost,
1162 &prologue_cost_vec,
1163 &epilogue_cost_vec);
1165 /* Prologue and epilogue costs are added to the target model later.
1166 These costs depend only on the scalar iteration cost, the
1167 number of peeling iterations finally chosen, and the number of
1168 misaligned statements. So discard the information found here. */
1169 prologue_cost_vec.release ();
1170 epilogue_cost_vec.release ();
1172 if (inside_cost < min->inside_cost
1173 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1175 min->inside_cost = inside_cost;
1176 min->outside_cost = outside_cost;
1177 min->body_cost_vec.release ();
1178 min->body_cost_vec = body_cost_vec;
1179 min->peel_info.dr = elem->dr;
1180 min->peel_info.npeel = elem->npeel;
1182 else
1183 body_cost_vec.release ();
1185 return 1;
1189 /* Choose best peeling option by traversing peeling hash table and either
1190 choosing an option with the lowest cost (if cost model is enabled) or the
1191 option that aligns as many accesses as possible. */
1193 static struct data_reference *
1194 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1195 unsigned int *npeel,
1196 stmt_vector_for_cost *body_cost_vec)
1198 struct _vect_peel_extended_info res;
1200 res.peel_info.dr = NULL;
1201 res.body_cost_vec = stmt_vector_for_cost ();
1203 if (!unlimited_cost_model ())
1205 res.inside_cost = INT_MAX;
1206 res.outside_cost = INT_MAX;
1207 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1208 .traverse <_vect_peel_extended_info *,
1209 vect_peeling_hash_get_lowest_cost> (&res);
1211 else
1213 res.peel_info.count = 0;
1214 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1215 .traverse <_vect_peel_extended_info *,
1216 vect_peeling_hash_get_most_frequent> (&res);
1219 *npeel = res.peel_info.npeel;
1220 *body_cost_vec = res.body_cost_vec;
1221 return res.peel_info.dr;
1225 /* Function vect_enhance_data_refs_alignment
1227 This pass will use loop versioning and loop peeling in order to enhance
1228 the alignment of data references in the loop.
1230 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1231 original loop is to be vectorized. Any other loops that are created by
 1232    the transformations performed in this pass are not supposed to be
1233 vectorized. This restriction will be relaxed.
1235 This pass will require a cost model to guide it whether to apply peeling
1236 or versioning or a combination of the two. For example, the scheme that
 1237    Intel uses when given a loop with several memory accesses is as follows:
 1238    choose one memory access ('p') whose alignment you want to force by doing
1239 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1240 other accesses are not necessarily aligned, or (2) use loop versioning to
1241 generate one loop in which all accesses are aligned, and another loop in
1242 which only 'p' is necessarily aligned.
1244 ("Automatic Intra-Register Vectorization for the Intel Architecture",
 1245    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1246 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1248 Devising a cost model is the most critical aspect of this work. It will
1249 guide us on which access to peel for, whether to use loop versioning, how
1250 many versions to create, etc. The cost model will probably consist of
1251 generic considerations as well as target specific considerations (on
1252 powerpc for example, misaligned stores are more painful than misaligned
1253 loads).
1255 Here are the general steps involved in alignment enhancements:
1257 -- original loop, before alignment analysis:
1258 for (i=0; i<N; i++){
1259 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1260 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1263 -- After vect_compute_data_refs_alignment:
1264 for (i=0; i<N; i++){
1265 x = q[i]; # DR_MISALIGNMENT(q) = 3
1266 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1269 -- Possibility 1: we do loop versioning:
1270 if (p is aligned) {
1271 for (i=0; i<N; i++){ # loop 1A
1272 x = q[i]; # DR_MISALIGNMENT(q) = 3
1273 p[i] = y; # DR_MISALIGNMENT(p) = 0
1276 else {
1277 for (i=0; i<N; i++){ # loop 1B
1278 x = q[i]; # DR_MISALIGNMENT(q) = 3
1279 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1283 -- Possibility 2: we do loop peeling:
1284 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1285 x = q[i];
1286 p[i] = y;
1288 for (i = 3; i < N; i++){ # loop 2A
1289 x = q[i]; # DR_MISALIGNMENT(q) = 0
1290 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1293 -- Possibility 3: combination of loop peeling and versioning:
1294 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1295 x = q[i];
1296 p[i] = y;
1298 if (p is aligned) {
1299 for (i = 3; i<N; i++){ # loop 3A
1300 x = q[i]; # DR_MISALIGNMENT(q) = 0
1301 p[i] = y; # DR_MISALIGNMENT(p) = 0
1304 else {
1305 for (i = 3; i<N; i++){ # loop 3B
1306 x = q[i]; # DR_MISALIGNMENT(q) = 0
1307 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1311 These loops are later passed to loop_transform to be vectorized. The
1312 vectorizer will use the alignment information to guide the transformation
1313 (whether to generate regular loads/stores, or with special handling for
1314 misalignment). */
1316 bool
1317 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1319 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1320 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1321 enum dr_alignment_support supportable_dr_alignment;
1322 struct data_reference *dr0 = NULL, *first_store = NULL;
1323 struct data_reference *dr;
1324 unsigned int i, j;
1325 bool do_peeling = false;
1326 bool do_versioning = false;
1327 bool stat;
1328 gimple stmt;
1329 stmt_vec_info stmt_info;
1330 unsigned int npeel = 0;
1331 bool all_misalignments_unknown = true;
1332 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1333 unsigned possible_npeel_number = 1;
1334 tree vectype;
1335 unsigned int nelements, mis, same_align_drs_max = 0;
1336 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_NOTE, vect_location,
1340 "=== vect_enhance_data_refs_alignment ===\n");
1342 /* While cost model enhancements are expected in the future, the high level
1343 view of the code at this time is as follows:
1345 A) If there is a misaligned access then see if peeling to align
1346 this access can make all data references satisfy
1347 vect_supportable_dr_alignment. If so, update data structures
1348 as needed and return true.
1350 B) If peeling wasn't possible and there is a data reference with an
1351 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1352 then see if loop versioning checks can be used to make all data
1353 references satisfy vect_supportable_dr_alignment. If so, update
1354 data structures as needed and return true.
1356 C) If neither peeling nor versioning were successful then return false if
1357 any data reference does not satisfy vect_supportable_dr_alignment.
1359 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1361 Note, Possibility 3 above (which is peeling and versioning together) is not
1362 being done at this time. */
1364 /* (1) Peeling to force alignment. */
1366 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1367 Considerations:
1368 + How many accesses will become aligned due to the peeling
1369 - How many accesses will become unaligned due to the peeling,
1370 and the cost of misaligned accesses.
1371 - The cost of peeling (the extra runtime checks, the increase
1372 in code size). */
1374 FOR_EACH_VEC_ELT (datarefs, i, dr)
1376 stmt = DR_STMT (dr);
1377 stmt_info = vinfo_for_stmt (stmt);
1379 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1380 continue;
1382 /* For interleaving, only the alignment of the first access
1383 matters. */
1384 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1385 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1386 continue;
1388 /* For invariant accesses there is nothing to enhance. */
1389 if (integer_zerop (DR_STEP (dr)))
1390 continue;
1392 /* Strided loads perform only component accesses, alignment is
1393 irrelevant for them. */
1394 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1395 continue;
1397 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1398 do_peeling = vector_alignment_reachable_p (dr);
1399 if (do_peeling)
1401 if (known_alignment_for_access_p (dr))
1403 unsigned int npeel_tmp;
1404 bool negative = tree_int_cst_compare (DR_STEP (dr),
1405 size_zero_node) < 0;
1407 /* Save info about DR in the hash table. */
1408 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1409 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1411 vectype = STMT_VINFO_VECTYPE (stmt_info);
1412 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1413 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1414 TREE_TYPE (DR_REF (dr))));
1415 npeel_tmp = (negative
1416 ? (mis - nelements) : (nelements - mis))
1417 & (nelements - 1);
1419 /* For multiple types, it is possible that the bigger type access
1420 will have more than one peeling option. E.g., a loop with two
1421 types: one of size (vector size / 4), and the other one of
 1422              types: one of size (vector size / 8).  The vectorization factor will be 8.  If both
 1423              accesses are misaligned by 3, the first one needs one scalar
 1424              iteration to be aligned, and the second one needs 5.  But the
 1425              first one will be aligned also by peeling 5 scalar
1426 iterations, and in that case both accesses will be aligned.
1427 Hence, except for the immediate peeling amount, we also want
1428 to try to add full vector size, while we don't exceed
1429 vectorization factor.
 1430              We do this automatically for the cost model, since we calculate the cost
1431 for every peeling option. */
1432 if (unlimited_cost_model ())
1433 possible_npeel_number = vf /nelements;
1435 /* Handle the aligned case. We may decide to align some other
1436 access, making DR unaligned. */
1437 if (DR_MISALIGNMENT (dr) == 0)
1439 npeel_tmp = 0;
1440 if (unlimited_cost_model ())
1441 possible_npeel_number++;
1444 for (j = 0; j < possible_npeel_number; j++)
1446 gcc_assert (npeel_tmp <= vf);
1447 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1448 npeel_tmp += nelements;
1451 all_misalignments_unknown = false;
1452 /* Data-ref that was chosen for the case that all the
1453 misalignments are unknown is not relevant anymore, since we
1454 have a data-ref with known alignment. */
1455 dr0 = NULL;
1457 else
1459 /* If we don't know any misalignment values, we prefer
 1460              peeling for the data-ref that has the maximum number of data-refs
 1461              with the same alignment, unless the target prefers to align
 1462              stores over loads.  */
1463 if (all_misalignments_unknown)
1465 unsigned same_align_drs
1466 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1467 if (!dr0
1468 || same_align_drs_max < same_align_drs)
1470 same_align_drs_max = same_align_drs;
1471 dr0 = dr;
1473 /* For data-refs with the same number of related
1474 accesses prefer the one where the misalign
1475 computation will be invariant in the outermost loop. */
1476 else if (same_align_drs_max == same_align_drs)
1478 struct loop *ivloop0, *ivloop;
1479 ivloop0 = outermost_invariant_loop_for_expr
1480 (loop, DR_BASE_ADDRESS (dr0));
1481 ivloop = outermost_invariant_loop_for_expr
1482 (loop, DR_BASE_ADDRESS (dr));
1483 if ((ivloop && !ivloop0)
1484 || (ivloop && ivloop0
1485 && flow_loop_nested_p (ivloop, ivloop0)))
1486 dr0 = dr;
1489 if (!first_store && DR_IS_WRITE (dr))
1490 first_store = dr;
1493 /* If there are both known and unknown misaligned accesses in the
1494 loop, we choose peeling amount according to the known
1495 accesses. */
1496 if (!supportable_dr_alignment)
1498 dr0 = dr;
1499 if (!first_store && DR_IS_WRITE (dr))
1500 first_store = dr;
1504 else
1506 if (!aligned_access_p (dr))
1508 if (dump_enabled_p ())
1509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1510 "vector alignment may not be reachable\n");
1511 break;
1516 /* Check if we can possibly peel the loop. */
1517 if (!vect_can_advance_ivs_p (loop_vinfo)
1518 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1519 do_peeling = false;
1521 if (do_peeling && all_misalignments_unknown
1522 && vect_supportable_dr_alignment (dr0, false))
1525 /* Check if the target requires to prefer stores over loads, i.e., if
1526 misaligned stores are more expensive than misaligned loads (taking
1527 drs with same alignment into account). */
1528 if (first_store && DR_IS_READ (dr0))
1530 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1531 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1532 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1533 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1534 stmt_vector_for_cost dummy;
1535 dummy.create (2);
1537 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1538 &dummy);
1539 vect_get_data_access_cost (first_store, &store_inside_cost,
1540 &store_outside_cost, &dummy);
1542 dummy.release ();
1544 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1545 aligning the load DR0). */
1546 load_inside_penalty = store_inside_cost;
1547 load_outside_penalty = store_outside_cost;
1548 for (i = 0;
1549 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1550 DR_STMT (first_store))).iterate (i, &dr);
1551 i++)
1552 if (DR_IS_READ (dr))
1554 load_inside_penalty += load_inside_cost;
1555 load_outside_penalty += load_outside_cost;
1557 else
1559 load_inside_penalty += store_inside_cost;
1560 load_outside_penalty += store_outside_cost;
1563 /* Calculate the penalty for leaving DR0 unaligned (by
1564 aligning the FIRST_STORE). */
1565 store_inside_penalty = load_inside_cost;
1566 store_outside_penalty = load_outside_cost;
1567 for (i = 0;
1568 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1569 DR_STMT (dr0))).iterate (i, &dr);
1570 i++)
1571 if (DR_IS_READ (dr))
1573 store_inside_penalty += load_inside_cost;
1574 store_outside_penalty += load_outside_cost;
1576 else
1578 store_inside_penalty += store_inside_cost;
1579 store_outside_penalty += store_outside_cost;
1582 if (load_inside_penalty > store_inside_penalty
1583 || (load_inside_penalty == store_inside_penalty
1584 && load_outside_penalty > store_outside_penalty))
1585 dr0 = first_store;
1588 /* In case there are only loads with different unknown misalignments, use
1589 peeling only if it may help to align other accesses in the loop. */
1590 if (!first_store
1591 && !STMT_VINFO_SAME_ALIGN_REFS (
1592 vinfo_for_stmt (DR_STMT (dr0))).length ()
1593 && vect_supportable_dr_alignment (dr0, false)
1594 != dr_unaligned_supported)
1595 do_peeling = false;
1598 if (do_peeling && !dr0)
1600 /* Peeling is possible, but there is no data access that is not supported
1601 unless aligned. So we try to choose the best possible peeling. */
1603 /* We should get here only if there are drs with known misalignment. */
1604 gcc_assert (!all_misalignments_unknown);
1606 /* Choose the best peeling from the hash table. */
1607 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1608 &body_cost_vec);
1609 if (!dr0 || !npeel)
1610 do_peeling = false;
1613 if (do_peeling)
1615 stmt = DR_STMT (dr0);
1616 stmt_info = vinfo_for_stmt (stmt);
1617 vectype = STMT_VINFO_VECTYPE (stmt_info);
1618 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1620 if (known_alignment_for_access_p (dr0))
1622 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1623 size_zero_node) < 0;
1624 if (!npeel)
1626 /* Since it's known at compile time, compute the number of
1627 iterations in the peeled loop (the peeling factor) for use in
1628 updating DR_MISALIGNMENT values. The peeling factor is the
1629 vectorization factor minus the misalignment as an element
1630 count. */
1631 mis = DR_MISALIGNMENT (dr0);
1632 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1633 npeel = ((negative ? mis - nelements : nelements - mis)
1634 & (nelements - 1));
1637 /* For interleaved data access every iteration accesses all the
1638 members of the group, therefore we divide the number of iterations
1639 by the group size. */
1640 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1641 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1642 npeel /= GROUP_SIZE (stmt_info);
1644 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_NOTE, vect_location,
1646 "Try peeling by %d\n", npeel);
1649 /* Ensure that all data refs can be vectorized after the peel. */
1650 FOR_EACH_VEC_ELT (datarefs, i, dr)
1652 int save_misalignment;
1654 if (dr == dr0)
1655 continue;
1657 stmt = DR_STMT (dr);
1658 stmt_info = vinfo_for_stmt (stmt);
1659 /* For interleaving, only the alignment of the first access
1660 matters. */
1661 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1662 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1663 continue;
1665 /* Strided loads perform only component accesses, alignment is
1666 irrelevant for them. */
1667 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1668 continue;
1670 save_misalignment = DR_MISALIGNMENT (dr);
1671 vect_update_misalignment_for_peel (dr, dr0, npeel);
1672 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1673 SET_DR_MISALIGNMENT (dr, save_misalignment);
1675 if (!supportable_dr_alignment)
1677 do_peeling = false;
1678 break;
1682 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1684 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1685 if (!stat)
1686 do_peeling = false;
1687 else
1689 body_cost_vec.release ();
1690 return stat;
1694 if (do_peeling)
1696 unsigned max_allowed_peel
1697 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1698 if (max_allowed_peel != (unsigned)-1)
1700 unsigned max_peel = npeel;
1701 if (max_peel == 0)
1703 gimple dr_stmt = DR_STMT (dr0);
1704 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1705 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1706 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1708 if (max_peel > max_allowed_peel)
1710 do_peeling = false;
1711 if (dump_enabled_p ())
1712 dump_printf_loc (MSG_NOTE, vect_location,
1713 "Disable peeling, max peels reached: %d\n", max_peel);
1718 if (do_peeling)
1720 stmt_info_for_cost *si;
1721 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1723 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1724 If the misalignment of DR_i is identical to that of dr0 then set
1725 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1726 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1727 by the peeling factor times the element size of DR_i (MOD the
1728 vectorization factor times the size). Otherwise, the
1729 misalignment of DR_i must be set to unknown. */
1730 FOR_EACH_VEC_ELT (datarefs, i, dr)
1731 if (dr != dr0)
1732 vect_update_misalignment_for_peel (dr, dr0, npeel);
1734 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1735 if (npeel)
1736 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1737 else
1738 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1739 SET_DR_MISALIGNMENT (dr0, 0);
1740 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_NOTE, vect_location,
1743 "Alignment of access forced using peeling.\n");
1744 dump_printf_loc (MSG_NOTE, vect_location,
1745 "Peeling for alignment will be applied.\n");
1747 /* We've delayed passing the inside-loop peeling costs to the
1748 target cost model until we were sure peeling would happen.
1749 Do so now. */
1750 if (body_cost_vec.exists ())
1752 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1754 struct _stmt_vec_info *stmt_info
1755 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1756 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1757 si->misalign, vect_body);
1759 body_cost_vec.release ();
1762 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1763 gcc_assert (stat);
1764 return stat;
1768 body_cost_vec.release ();
1770 /* (2) Versioning to force alignment. */
1772 /* Try versioning if:
1773 1) optimize loop for speed
1774 2) there is at least one unsupported misaligned data ref with an unknown
1775 misalignment, and
1776 3) all misaligned data refs with a known misalignment are supported, and
1777 4) the number of runtime alignment checks is within reason. */
1779 do_versioning =
1780 optimize_loop_nest_for_speed_p (loop)
1781 && (!loop->inner); /* FORNOW */
1783 if (do_versioning)
1785 FOR_EACH_VEC_ELT (datarefs, i, dr)
1787 stmt = DR_STMT (dr);
1788 stmt_info = vinfo_for_stmt (stmt);
1790 /* For interleaving, only the alignment of the first access
1791 matters. */
1792 if (aligned_access_p (dr)
1793 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1794 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1795 continue;
1797 /* Strided loads perform only component accesses, alignment is
1798 irrelevant for them. */
1799 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1800 continue;
1802 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1804 if (!supportable_dr_alignment)
1806 gimple stmt;
1807 int mask;
1808 tree vectype;
1810 if (known_alignment_for_access_p (dr)
1811 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1812 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1814 do_versioning = false;
1815 break;
1818 stmt = DR_STMT (dr);
1819 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1820 gcc_assert (vectype);
1822 /* The rightmost bits of an aligned address must be zeros.
1823 Construct the mask needed for this test. For example,
1824 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1825 mask must be 15 = 0xf. */
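	  /* A sketch only (not the exact code emitted later): with the V4SI
	     example above, the versioning guard effectively tests that
	     ((uintptr_t) addr & 0xf) == 0 holds for every reference recorded
	     in LOOP_VINFO_MAY_MISALIGN_STMTS.  */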
1826 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1828 /* FORNOW: use the same mask to test all potentially unaligned
1829 references in the loop. The vectorizer currently supports
1830 a single vector size, see the reference to
1831 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1832 vectorization factor is computed. */
1833 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1834 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1835 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1836 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1837 DR_STMT (dr));
1841 /* Versioning requires at least one misaligned data reference. */
1842 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1843 do_versioning = false;
1844 else if (!do_versioning)
1845 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1848 if (do_versioning)
1850 vec<gimple> may_misalign_stmts
1851 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1852 gimple stmt;
1854 /* It can now be assumed that the data references in the statements
1855 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1856 of the loop being vectorized. */
1857 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1859 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1860 dr = STMT_VINFO_DATA_REF (stmt_info);
1861 SET_DR_MISALIGNMENT (dr, 0);
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_NOTE, vect_location,
1864 "Alignment of access forced using versioning.\n");
1867 if (dump_enabled_p ())
1868 dump_printf_loc (MSG_NOTE, vect_location,
1869 "Versioning for alignment will be applied.\n");
1871 /* Peeling and versioning can't be done together at this time. */
1872 gcc_assert (! (do_peeling && do_versioning));
1874 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1875 gcc_assert (stat);
1876 return stat;
1879 /* This point is reached if neither peeling nor versioning is being done. */
1880 gcc_assert (! (do_peeling || do_versioning));
1882 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1883 return stat;
1887 /* Function vect_find_same_alignment_drs.
1889 Update group and alignment relations according to the chosen
1890 vectorization factor. */
1892 static void
1893 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1894 loop_vec_info loop_vinfo)
1896 unsigned int i;
1897 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1898 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1899 struct data_reference *dra = DDR_A (ddr);
1900 struct data_reference *drb = DDR_B (ddr);
1901 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1902 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1903 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1904 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1905 lambda_vector dist_v;
1906 unsigned int loop_depth;
1908 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1909 return;
1911 if (dra == drb)
1912 return;
1914 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1915 return;
1917 /* Loop-based vectorization and known data dependence. */
1918 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1919 return;
1921 /* Data-dependence analysis reports a distance vector of zero
1922 for data-references that overlap only in the first iteration
1923 but have different sign step (see PR45764).
1924 So as a sanity check require equal DR_STEP. */
1925 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1926 return;
1928 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1929 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1931 int dist = dist_v[loop_depth];
1933 if (dump_enabled_p ())
1934 dump_printf_loc (MSG_NOTE, vect_location,
1935 "dependence distance = %d.\n", dist);
1937 /* Same loop iteration. */
1938 if (dist == 0
1939 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1941 /* Two references with distance zero have the same alignment. */
1942 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1943 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1944 if (dump_enabled_p ())
1946 dump_printf_loc (MSG_NOTE, vect_location,
1947 "accesses have the same alignment.\n");
1948 dump_printf (MSG_NOTE,
1949 "dependence distance modulo vf == 0 between ");
1950 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1951 dump_printf (MSG_NOTE, " and ");
1952 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1953 dump_printf (MSG_NOTE, "\n");
1960 /* Function vect_analyze_data_refs_alignment
1962 Analyze the alignment of the data-references in the loop.
1963 Return FALSE if a data reference is found that cannot be vectorized. */
1965 bool
1966 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1967 bb_vec_info bb_vinfo)
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_NOTE, vect_location,
1971 "=== vect_analyze_data_refs_alignment ===\n");
1973 /* Mark groups of data references with same alignment using
1974 data dependence information. */
1975 if (loop_vinfo)
1977 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1978 struct data_dependence_relation *ddr;
1979 unsigned int i;
1981 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1982 vect_find_same_alignment_drs (ddr, loop_vinfo);
1985 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1987 if (dump_enabled_p ())
1988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1989 "not vectorized: can't calculate alignment "
1990 "for data ref.\n");
1991 return false;
1994 return true;
1998 /* Analyze groups of accesses: check that DR belongs to a group of
1999 accesses of legal size, step, etc. Detect gaps, single element
2000 interleaving, and other special cases. Set grouped access info.
2001 Collect groups of strided stores for further use in SLP analysis. */
2003 static bool
2004 vect_analyze_group_access (struct data_reference *dr)
2006 tree step = DR_STEP (dr);
2007 tree scalar_type = TREE_TYPE (DR_REF (dr));
2008 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2009 gimple stmt = DR_STMT (dr);
2010 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2011 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2012 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2013 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2014 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2015 bool slp_impossible = false;
2016 struct loop *loop = NULL;
2018 if (loop_vinfo)
2019 loop = LOOP_VINFO_LOOP (loop_vinfo);
2021 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2022 size of the interleaving group (including gaps). */
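  /* For illustration: a loop that accesses a[3*i] and a[3*i+1] with 4-byte
     ints has DR_STEP = 12 bytes and type_size = 4, giving groupsize
     12 / 4 = 3 (two accessed elements plus one gap).  */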
2023 groupsize = absu_hwi (dr_step) / type_size;
2025 /* A non-consecutive access is possible only if it is part of interleaving. */
2026 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2028 /* Check if this DR is a part of interleaving, and is a single
2029 element of the group that is accessed in the loop. */
2031 /* Gaps are supported only for loads. STEP must be a multiple of the type
2032 size. The size of the group must be a power of 2. */
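      /* A hypothetical instance of this case:

	   for (i = 0; i < n; i++)
	     sum += a[2*i];

	 is a load with groupsize 2 of which only one element is accessed;
	 the resulting gap is why a scalar epilogue loop is required below.  */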
2033 if (DR_IS_READ (dr)
2034 && (dr_step % type_size) == 0
2035 && groupsize > 0
2036 && exact_log2 (groupsize) != -1)
2038 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2039 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2040 if (dump_enabled_p ())
2042 dump_printf_loc (MSG_NOTE, vect_location,
2043 "Detected single element interleaving ");
2044 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2045 dump_printf (MSG_NOTE, " step ");
2046 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2047 dump_printf (MSG_NOTE, "\n");
2050 if (loop_vinfo)
2052 if (dump_enabled_p ())
2053 dump_printf_loc (MSG_NOTE, vect_location,
2054 "Data access with gaps requires scalar "
2055 "epilogue loop\n");
2056 if (loop->inner)
2058 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2060 "Peeling for outer loop is not"
2061 " supported\n");
2062 return false;
2065 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2068 return true;
2071 if (dump_enabled_p ())
2073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2074 "not consecutive access ");
2075 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2076 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2079 if (bb_vinfo)
2081 /* Mark the statement as unvectorizable. */
2082 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2083 return true;
2086 return false;
2089 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2091 /* First stmt in the interleaving chain. Check the chain. */
2092 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2093 struct data_reference *data_ref = dr;
2094 unsigned int count = 1;
2095 tree prev_init = DR_INIT (data_ref);
2096 gimple prev = stmt;
2097 HOST_WIDE_INT diff, gaps = 0;
2098 unsigned HOST_WIDE_INT count_in_bytes;
2100 while (next)
2102 /* Skip same data-refs. In case that two or more stmts share
2103 data-ref (supported only for loads), we vectorize only the first
2104 stmt, and the rest get their vectorized loads from the first
2105 one. */
2106 if (!tree_int_cst_compare (DR_INIT (data_ref),
2107 DR_INIT (STMT_VINFO_DATA_REF (
2108 vinfo_for_stmt (next)))))
2110 if (DR_IS_WRITE (data_ref))
2112 if (dump_enabled_p ())
2113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2114 "Two store stmts share the same dr.\n");
2115 return false;
2118 /* For load use the same data-ref load. */
2119 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2121 prev = next;
2122 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2123 continue;
2126 prev = next;
2127 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2129 /* All group members have the same STEP by construction. */
2130 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2132 /* Check that the distance between two accesses is equal to the type
2133 size. Otherwise, we have gaps. */
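	  /* Illustrative only: for 4-byte elements with DR_INITs 0 and 8,
	     DIFF is (8 - 0) / 4 = 2, i.e. there is a gap of one element
	     between the two accesses.  */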
2134 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2135 - TREE_INT_CST_LOW (prev_init)) / type_size;
2136 if (diff != 1)
2138 /* FORNOW: SLP of accesses with gaps is not supported. */
2139 slp_impossible = true;
2140 if (DR_IS_WRITE (data_ref))
2142 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2144 "interleaved store with gaps\n");
2145 return false;
2148 gaps += diff - 1;
2151 last_accessed_element += diff;
2153 /* Store the gap from the previous member of the group. If there is no
2154 gap in the access, GROUP_GAP is always 1. */
2155 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2157 prev_init = DR_INIT (data_ref);
2158 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2159 /* Count the number of data-refs in the chain. */
2160 count++;
2163 /* COUNT is the number of accesses found; we multiply it by the size of
2164 the type to get COUNT_IN_BYTES. */
2165 count_in_bytes = type_size * count;
2167 /* Check that the size of the interleaving (including gaps) is not
2168 greater than STEP. */
2169 if (dr_step != 0
2170 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2172 if (dump_enabled_p ())
2174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2175 "interleaving size is greater than step for ");
2176 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2177 DR_REF (dr));
2178 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2180 return false;
2183 /* Check that the size of the interleaving is equal to STEP for stores,
2184 i.e., that there are no gaps. */
2185 if (dr_step != 0
2186 && absu_hwi (dr_step) != count_in_bytes)
2188 if (DR_IS_READ (dr))
2190 slp_impossible = true;
2191 /* There is a gap after the last load in the group. This gap is a
2192 difference between the groupsize and the number of elements.
2193 When there is no gap, this difference should be 0. */
2194 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2196 else
2198 if (dump_enabled_p ())
2199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2200 "interleaved store with gaps\n");
2201 return false;
2205 /* Check that STEP is a multiple of type size. */
2206 if (dr_step != 0
2207 && (dr_step % type_size) != 0)
2209 if (dump_enabled_p ())
2211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2212 "step is not a multiple of type size: step ");
2213 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2214 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2215 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2216 TYPE_SIZE_UNIT (scalar_type));
2217 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2219 return false;
2222 if (groupsize == 0)
2223 groupsize = count;
2225 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2226 if (dump_enabled_p ())
2227 dump_printf_loc (MSG_NOTE, vect_location,
2228 "Detected interleaving of size %d\n", (int)groupsize);
2230 /* SLP: create an SLP data structure for every interleaving group of
2231 stores for further analysis in vect_analyse_slp. */
2232 if (DR_IS_WRITE (dr) && !slp_impossible)
2234 if (loop_vinfo)
2235 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2236 if (bb_vinfo)
2237 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2240 /* There is a gap at the end of the group. */
2241 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2243 if (dump_enabled_p ())
2244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2245 "Data access with gaps requires scalar "
2246 "epilogue loop\n");
2247 if (loop->inner)
2249 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2251 "Peeling for outer loop is not supported\n");
2252 return false;
2255 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2259 return true;
2263 /* Analyze the access pattern of the data-reference DR.
2264 In case of non-consecutive accesses call vect_analyze_group_access() to
2265 analyze groups of accesses. */
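/* As a rough guide to the cases handled below (the code has the
   authoritative conditions): a zero step is an invariant access, a constant
   step equal to the scalar type size is a consecutive access, a
   non-constant step is treated as a strided load, and everything else is
   analyzed as a grouped (interleaved) access.  */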
2267 static bool
2268 vect_analyze_data_ref_access (struct data_reference *dr)
2270 tree step = DR_STEP (dr);
2271 tree scalar_type = TREE_TYPE (DR_REF (dr));
2272 gimple stmt = DR_STMT (dr);
2273 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2274 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2275 struct loop *loop = NULL;
2277 if (loop_vinfo)
2278 loop = LOOP_VINFO_LOOP (loop_vinfo);
2280 if (loop_vinfo && !step)
2282 if (dump_enabled_p ())
2283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2284 "bad data-ref access in loop\n");
2285 return false;
2288 /* Allow invariant loads in loops that are not nested. */
2289 if (loop_vinfo && integer_zerop (step))
2291 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2292 if (nested_in_vect_loop_p (loop, stmt))
2294 if (dump_enabled_p ())
2295 dump_printf_loc (MSG_NOTE, vect_location,
2296 "zero step in inner loop of nest\n");
2297 return false;
2299 return DR_IS_READ (dr);
2302 if (loop && nested_in_vect_loop_p (loop, stmt))
2304 /* Interleaved accesses are not yet supported within outer-loop
2305 vectorization for references in the inner-loop. */
2306 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2308 /* For the rest of the analysis we use the outer-loop step. */
2309 step = STMT_VINFO_DR_STEP (stmt_info);
2310 if (integer_zerop (step))
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_NOTE, vect_location,
2314 "zero step in outer loop.\n");
2315 if (DR_IS_READ (dr))
2316 return true;
2317 else
2318 return false;
2322 /* Consecutive? */
2323 if (TREE_CODE (step) == INTEGER_CST)
2325 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2326 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2327 || (dr_step < 0
2328 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2330 /* Mark that it is not interleaving. */
2331 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2332 return true;
2336 if (loop && nested_in_vect_loop_p (loop, stmt))
2338 if (dump_enabled_p ())
2339 dump_printf_loc (MSG_NOTE, vect_location,
2340 "grouped access in outer loop.\n");
2341 return false;
2344 /* Assume this is a DR handled by non-constant strided load case. */
2345 if (TREE_CODE (step) != INTEGER_CST)
2346 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2348 /* Not a consecutive access - check if it's part of an interleaving group. */
2349 return vect_analyze_group_access (dr);
2354 /* A helper function used in the comparator function to sort data
2355 references. T1 and T2 are two data references to be compared.
2356 The function returns -1, 0, or 1. */
2358 static int
2359 compare_tree (tree t1, tree t2)
2361 int i, cmp;
2362 enum tree_code code;
2363 char tclass;
2365 if (t1 == t2)
2366 return 0;
2367 if (t1 == NULL)
2368 return -1;
2369 if (t2 == NULL)
2370 return 1;
2373 if (TREE_CODE (t1) != TREE_CODE (t2))
2374 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2376 code = TREE_CODE (t1);
2377 switch (code)
2379 /* For const values, we can just use hash values for comparisons. */
2380 case INTEGER_CST:
2381 case REAL_CST:
2382 case FIXED_CST:
2383 case STRING_CST:
2384 case COMPLEX_CST:
2385 case VECTOR_CST:
2387 hashval_t h1 = iterative_hash_expr (t1, 0);
2388 hashval_t h2 = iterative_hash_expr (t2, 0);
2389 if (h1 != h2)
2390 return h1 < h2 ? -1 : 1;
2391 break;
2394 case SSA_NAME:
2395 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2396 if (cmp != 0)
2397 return cmp;
2399 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2400 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2401 break;
2403 default:
2404 tclass = TREE_CODE_CLASS (code);
2406 /* For var-decl, we could compare their UIDs. */
2407 if (tclass == tcc_declaration)
2409 if (DECL_UID (t1) != DECL_UID (t2))
2410 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2411 break;
2414 /* For expressions with operands, compare their operands recursively. */
2415 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2417 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2418 if (cmp != 0)
2419 return cmp;
2423 return 0;
2427 /* Compare two data-references DRA and DRB to sort them into chunks
2428 suitable for grouping. */
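/* The comparison below orders data-refs by, in decreasing priority:
   base address, DR_OFFSET, reads before writes, access size, DR_STEP,
   DR_INIT, and finally the UID of the containing stmt (to keep the sort
   stable).  */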
2430 static int
2431 dr_group_sort_cmp (const void *dra_, const void *drb_)
2433 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2434 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2435 int cmp;
2437 /* Stabilize sort. */
2438 if (dra == drb)
2439 return 0;
2441 /* Ordering of DRs according to base. */
2442 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2444 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2445 if (cmp != 0)
2446 return cmp;
2449 /* And according to DR_OFFSET. */
2450 if (!dr_equal_offsets_p (dra, drb))
2452 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2453 if (cmp != 0)
2454 return cmp;
2457 /* Put reads before writes. */
2458 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2459 return DR_IS_READ (dra) ? -1 : 1;
2461 /* Then sort after access size. */
2462 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2463 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2465 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2466 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2467 if (cmp != 0)
2468 return cmp;
2471 /* And after step. */
2472 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2474 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2475 if (cmp != 0)
2476 return cmp;
2479 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2480 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2481 if (cmp == 0)
2482 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2483 return cmp;
2486 /* Function vect_analyze_data_ref_accesses.
2488 Analyze the access pattern of all the data references in the loop.
2490 FORNOW: the only access pattern that is considered vectorizable is a
2491 simple step 1 (consecutive) access.
2493 FORNOW: handle only arrays and pointer accesses. */
2495 bool
2496 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2498 unsigned int i;
2499 vec<data_reference_p> datarefs;
2500 struct data_reference *dr;
2502 if (dump_enabled_p ())
2503 dump_printf_loc (MSG_NOTE, vect_location,
2504 "=== vect_analyze_data_ref_accesses ===\n");
2506 if (loop_vinfo)
2507 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2508 else
2509 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2511 if (datarefs.is_empty ())
2512 return true;
2514 /* Sort the array of datarefs to make building the interleaving chains
2515 linear. */
2516 qsort (datarefs.address (), datarefs.length (),
2517 sizeof (data_reference_p), dr_group_sort_cmp);
2519 /* Build the interleaving chains. */
2520 for (i = 0; i < datarefs.length () - 1;)
2522 data_reference_p dra = datarefs[i];
2523 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2524 stmt_vec_info lastinfo = NULL;
2525 for (i = i + 1; i < datarefs.length (); ++i)
2527 data_reference_p drb = datarefs[i];
2528 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2530 /* ??? Imperfect sorting (non-compatible types, non-modulo
2531 accesses, same accesses) can lead to a group being artificially
2532 split here as we don't just skip over those. If it really
2533 matters we can push those to a worklist and re-iterate
2534 over them. Then we can just skip ahead to the next DR here. */
2536 /* Check that the data-refs have same first location (except init)
2537 and they are both either store or load (not load and store). */
2538 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2539 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2540 DR_BASE_ADDRESS (drb), 0)
2541 || !dr_equal_offsets_p (dra, drb))
2542 break;
2544 /* Check that the data-refs have the same constant size and step. */
2545 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2546 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2547 if (!tree_fits_uhwi_p (sza)
2548 || !tree_fits_uhwi_p (szb)
2549 || !tree_int_cst_equal (sza, szb)
2550 || !tree_fits_shwi_p (DR_STEP (dra))
2551 || !tree_fits_shwi_p (DR_STEP (drb))
2552 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2553 break;
2555 /* Do not place the same access in the interleaving chain twice. */
2556 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2557 break;
2559 /* Check the types are compatible.
2560 ??? We don't distinguish this during sorting. */
2561 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2562 TREE_TYPE (DR_REF (drb))))
2563 break;
2565 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2566 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2567 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2568 gcc_assert (init_a < init_b);
2570 /* If init_b == init_a + the size of the type * k, we have an
2571 interleaving, and DRA is accessed before DRB. */
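	  /* A hypothetical example: for 4-byte ints the DRs of a[i] and
	     a[i+2] have init_a = 0 and init_b = 8; (8 - 0) % 4 == 0, so the
	     pair remains a candidate for the interleaving chain built
	     below.  */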
2572 HOST_WIDE_INT type_size_a = TREE_INT_CST_LOW (sza);
2573 if ((init_b - init_a) % type_size_a != 0)
2574 break;
2576 /* The step (if not zero) must be greater than the difference between
2577 data-refs' inits, otherwise we stop here to split groups into suitable sizes. */
2578 HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (dra));
2579 if (step != 0 && step <= (init_b - init_a))
2580 break;
2582 if (dump_enabled_p ())
2584 dump_printf_loc (MSG_NOTE, vect_location,
2585 "Detected interleaving ");
2586 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2587 dump_printf (MSG_NOTE, " and ");
2588 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2589 dump_printf (MSG_NOTE, "\n");
2592 /* Link the found element into the group list. */
2593 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2595 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2596 lastinfo = stmtinfo_a;
2598 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2599 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2600 lastinfo = stmtinfo_b;
2604 FOR_EACH_VEC_ELT (datarefs, i, dr)
2605 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2606 && !vect_analyze_data_ref_access (dr))
2608 if (dump_enabled_p ())
2609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2610 "not vectorized: complicated access pattern.\n");
2612 if (bb_vinfo)
2614 /* Mark the statement as not vectorizable. */
2615 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2616 continue;
2618 else
2619 return false;
2622 return true;
2626 /* Operator == between two dr_with_seg_len objects.
2628 This equality operator is used to make sure two data refs
2629 are the same, so that we can consider combining the
2630 aliasing checks of those two pairs of data-dependent data
2631 refs. */
2633 static bool
2634 operator == (const dr_with_seg_len& d1,
2635 const dr_with_seg_len& d2)
2637 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2638 DR_BASE_ADDRESS (d2.dr), 0)
2639 && compare_tree (d1.offset, d2.offset) == 0
2640 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2643 /* Function comp_dr_with_seg_len_pair.
2645 Comparison function for sorting objects of dr_with_seg_len_pair_t
2646 so that we can combine aliasing checks in one scan. */
2648 static int
2649 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2651 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2652 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2654 const dr_with_seg_len &p11 = p1->first,
2655 &p12 = p1->second,
2656 &p21 = p2->first,
2657 &p22 = p2->second;
2659 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2660 if a and c have the same base address and step, and b and d have the same
2661 address and step. Therefore, if either a&c or b&d don't have the same address
2662 and step, we don't care about the order of those two pairs after sorting. */
2663 int comp_res;
2665 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2666 DR_BASE_ADDRESS (p21.dr))) != 0)
2667 return comp_res;
2668 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2669 DR_BASE_ADDRESS (p22.dr))) != 0)
2670 return comp_res;
2671 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2672 return comp_res;
2673 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2674 return comp_res;
2675 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2676 return comp_res;
2677 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2678 return comp_res;
2680 return 0;
2683 template <class T> static void
2684 swap (T& a, T& b)
2686 T c (a);
2687 a = b;
2688 b = c;
2691 /* Function vect_vfa_segment_size.
2693 Create an expression that computes the size of the segment
2694 that will be accessed for a data reference. The function takes into
2695 account that realignment loads may access one more vector.
2697 Input:
2698 DR: The data reference.
2699 LENGTH_FACTOR: segment length to consider.
2701 Return an expression whose value is the size of segment which will be
2702 accessed by DR. */
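/* E.g. (illustrative numbers): a consecutive 4-byte access with
   LENGTH_FACTOR equal to a vectorization factor of 4 yields a segment of
   4 * 4 = 16 bytes; under dr_explicit_realign_optimized one extra vector
   size is added, as done below.  */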
2704 static tree
2705 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2707 tree segment_length;
2709 if (integer_zerop (DR_STEP (dr)))
2710 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2711 else
2712 segment_length = size_binop (MULT_EXPR,
2713 fold_convert (sizetype, DR_STEP (dr)),
2714 fold_convert (sizetype, length_factor));
2716 if (vect_supportable_dr_alignment (dr, false)
2717 == dr_explicit_realign_optimized)
2719 tree vector_size = TYPE_SIZE_UNIT
2720 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2722 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2724 return segment_length;
2727 /* Function vect_prune_runtime_alias_test_list.
2729 Prune a list of ddrs to be tested at run-time by versioning for alias.
2730 Merge several alias checks into one if possible.
2731 Return FALSE if the resulting list of ddrs is longer than allowed by
2732 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2734 bool
2735 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2737 vec<ddr_p> may_alias_ddrs =
2738 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2739 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2740 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2741 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2742 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2744 ddr_p ddr;
2745 unsigned int i;
2746 tree length_factor;
2748 if (dump_enabled_p ())
2749 dump_printf_loc (MSG_NOTE, vect_location,
2750 "=== vect_prune_runtime_alias_test_list ===\n");
2752 if (may_alias_ddrs.is_empty ())
2753 return true;
2755 /* Basically, for each pair of dependent data refs store_ptr_0
2756 and load_ptr_0, we create an expression:
2758 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2759 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2761 for aliasing checks. However, in some cases we can decrease
2762 the number of checks by combining two checks into one. For
2763 example, suppose we have another pair of data refs store_ptr_0
2764 and load_ptr_1, and if the following condition is satisfied:
2766 load_ptr_0 < load_ptr_1 &&
2767 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2769 (this condition means, in each iteration of vectorized loop,
2770 the accessed memory of store_ptr_0 cannot be between the memory
2771 of load_ptr_0 and load_ptr_1.)
2773 we can then use only the following expression to finish the
2774 aliasing checks between store_ptr_0 & load_ptr_0 and
2775 store_ptr_0 & load_ptr_1:
2777 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2778 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2780 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2781 same basic address. */
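  /* A concrete (hypothetical) instance of the condition above: if
     load_ptr_1 == load_ptr_0 + 16, load_segment_length_0 == 16 and
     store_segment_length_0 == 32, then 16 - 16 < 32 holds and the two
     checks collapse into one.  */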
2783 comp_alias_ddrs.create (may_alias_ddrs.length ());
2785 /* First, we collect all data ref pairs for aliasing checks. */
2786 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2788 struct data_reference *dr_a, *dr_b;
2789 gimple dr_group_first_a, dr_group_first_b;
2790 tree segment_length_a, segment_length_b;
2791 gimple stmt_a, stmt_b;
2793 dr_a = DDR_A (ddr);
2794 stmt_a = DR_STMT (DDR_A (ddr));
2795 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2796 if (dr_group_first_a)
2798 stmt_a = dr_group_first_a;
2799 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2802 dr_b = DDR_B (ddr);
2803 stmt_b = DR_STMT (DDR_B (ddr));
2804 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2805 if (dr_group_first_b)
2807 stmt_b = dr_group_first_b;
2808 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2811 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2812 length_factor = scalar_loop_iters;
2813 else
2814 length_factor = size_int (vect_factor);
2815 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2816 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2818 dr_with_seg_len_pair_t dr_with_seg_len_pair
2819 (dr_with_seg_len (dr_a, segment_length_a),
2820 dr_with_seg_len (dr_b, segment_length_b));
2822 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2823 swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2825 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2828 /* Second, we sort the collected data ref pairs so that we can scan
2829 them once to combine all possible aliasing checks. */
2830 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2832 /* Third, we scan the sorted dr pairs and check if we can combine
2833 alias checks of two neighbouring dr pairs. */
2834 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2836 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2837 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2838 *dr_b1 = &comp_alias_ddrs[i-1].second,
2839 *dr_a2 = &comp_alias_ddrs[i].first,
2840 *dr_b2 = &comp_alias_ddrs[i].second;
2842 /* Remove duplicate data ref pairs. */
2843 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2845 if (dump_enabled_p ())
2847 dump_printf_loc (MSG_NOTE, vect_location,
2848 "found equal ranges ");
2849 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2850 DR_REF (dr_a1->dr));
2851 dump_printf (MSG_NOTE, ", ");
2852 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2853 DR_REF (dr_b1->dr));
2854 dump_printf (MSG_NOTE, " and ");
2855 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2856 DR_REF (dr_a2->dr));
2857 dump_printf (MSG_NOTE, ", ");
2858 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2859 DR_REF (dr_b2->dr));
2860 dump_printf (MSG_NOTE, "\n");
2863 comp_alias_ddrs.ordered_remove (i--);
2864 continue;
2867 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2869 /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
2870 and DR_A1 and DR_A2 are two consecutive memrefs. */
2871 if (*dr_a1 == *dr_a2)
2873 swap (dr_a1, dr_b1);
2874 swap (dr_a2, dr_b2);
2877 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2878 DR_BASE_ADDRESS (dr_a2->dr),
2880 || !tree_fits_shwi_p (dr_a1->offset)
2881 || !tree_fits_shwi_p (dr_a2->offset))
2882 continue;
2884 HOST_WIDE_INT diff = TREE_INT_CST_LOW (dr_a2->offset) -
2885 TREE_INT_CST_LOW (dr_a1->offset);
2888 /* Now we check if the following condition is satisfied:
2890 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2892 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2893 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2894 have to make a best estimation. We can get the minimum value
2895 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2896 then either of the following two conditions can guarantee the
2897 one above:
2899 1: DIFF <= MIN_SEG_LEN_B
2900 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2904 HOST_WIDE_INT
2905 min_seg_len_b = (TREE_CODE (dr_b1->seg_len) == INTEGER_CST) ?
2906 TREE_INT_CST_LOW (dr_b1->seg_len) :
2907 vect_factor;
2909 if (diff <= min_seg_len_b
2910 || (TREE_CODE (dr_a1->seg_len) == INTEGER_CST
2911 && diff - (HOST_WIDE_INT) TREE_INT_CST_LOW (dr_a1->seg_len) <
2912 min_seg_len_b))
2914 dr_a1->seg_len = size_binop (PLUS_EXPR,
2915 dr_a2->seg_len, size_int (diff));
2916 comp_alias_ddrs.ordered_remove (i--);
2921 if ((int) comp_alias_ddrs.length () >
2922 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2924 if (dump_enabled_p ())
2926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2927 "disable versioning for alias - max number of "
2928 "generated checks exceeded.\n");
2931 return false;
2934 return true;
2937 /* Check whether a non-affine read in stmt is suitable for gather load
2938 and if so, return a builtin decl for that operation. */
2940 tree
2941 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2942 tree *offp, int *scalep)
2944 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2945 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2946 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2947 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2948 tree offtype = NULL_TREE;
2949 tree decl, base, off;
2950 enum machine_mode pmode;
2951 int punsignedp, pvolatilep;
2953 /* The gather builtins need address of the form
2954 loop_invariant + vector * {1, 2, 4, 8}
2956 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2957 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2958 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2959 multiplications and additions in it. To get a vector, we need
2960 a single SSA_NAME that will be defined in the loop and will
2961 contain everything that is not loop invariant and that can be
2962 vectorized. The following code attempts to find such a preexisting
2963 SSA_NAME OFF and put the loop invariants into a tree BASE
2964 that can be gimplified before the loop. */
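  /* A hypothetical example of the intended decomposition: for a read such
     as p[idx[i]] with 8-byte elements, the goal is BASE = p (loop
     invariant), OFF = the SSA_NAME computing idx[i] (possibly
     sign-extended), and SCALE = 8.  */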
2965 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2966 &pmode, &punsignedp, &pvolatilep, false);
2967 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2969 if (TREE_CODE (base) == MEM_REF)
2971 if (!integer_zerop (TREE_OPERAND (base, 1)))
2973 if (off == NULL_TREE)
2975 double_int moff = mem_ref_offset (base);
2976 off = double_int_to_tree (sizetype, moff);
2978 else
2979 off = size_binop (PLUS_EXPR, off,
2980 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2982 base = TREE_OPERAND (base, 0);
2984 else
2985 base = build_fold_addr_expr (base);
2987 if (off == NULL_TREE)
2988 off = size_zero_node;
2990 /* If base is not loop invariant, then if off is 0 we start with just
2991 the constant offset in the loop invariant BASE and continue with base
2992 as OFF; otherwise we give up.
2993 We could handle that case by gimplifying the addition of base + off
2994 into some SSA_NAME and use that as off, but for now punt. */
2995 if (!expr_invariant_in_loop_p (loop, base))
2997 if (!integer_zerop (off))
2998 return NULL_TREE;
2999 off = base;
3000 base = size_int (pbitpos / BITS_PER_UNIT);
3002 /* Otherwise put base + constant offset into the loop invariant BASE
3003 and continue with OFF. */
3004 else
3006 base = fold_convert (sizetype, base);
3007 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3010 /* OFF at this point may be either a SSA_NAME or some tree expression
3011 from get_inner_reference. Try to peel off loop invariants from it
3012 into BASE as long as possible. */
3013 STRIP_NOPS (off);
3014 while (offtype == NULL_TREE)
3016 enum tree_code code;
3017 tree op0, op1, add = NULL_TREE;
3019 if (TREE_CODE (off) == SSA_NAME)
3021 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3023 if (expr_invariant_in_loop_p (loop, off))
3024 return NULL_TREE;
3026 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3027 break;
3029 op0 = gimple_assign_rhs1 (def_stmt);
3030 code = gimple_assign_rhs_code (def_stmt);
3031 op1 = gimple_assign_rhs2 (def_stmt);
3033 else
3035 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3036 return NULL_TREE;
3037 code = TREE_CODE (off);
3038 extract_ops_from_tree (off, &code, &op0, &op1);
3040 switch (code)
3042 case POINTER_PLUS_EXPR:
3043 case PLUS_EXPR:
3044 if (expr_invariant_in_loop_p (loop, op0))
3046 add = op0;
3047 off = op1;
3048 do_add:
3049 add = fold_convert (sizetype, add);
3050 if (scale != 1)
3051 add = size_binop (MULT_EXPR, add, size_int (scale));
3052 base = size_binop (PLUS_EXPR, base, add);
3053 continue;
3055 if (expr_invariant_in_loop_p (loop, op1))
3057 add = op1;
3058 off = op0;
3059 goto do_add;
3061 break;
3062 case MINUS_EXPR:
3063 if (expr_invariant_in_loop_p (loop, op1))
3065 add = fold_convert (sizetype, op1);
3066 add = size_binop (MINUS_EXPR, size_zero_node, add);
3067 off = op0;
3068 goto do_add;
3070 break;
3071 case MULT_EXPR:
3072 if (scale == 1 && tree_fits_shwi_p (op1))
3074 scale = tree_to_shwi (op1);
3075 off = op0;
3076 continue;
3078 break;
3079 case SSA_NAME:
3080 off = op0;
3081 continue;
3082 CASE_CONVERT:
3083 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3084 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3085 break;
3086 if (TYPE_PRECISION (TREE_TYPE (op0))
3087 == TYPE_PRECISION (TREE_TYPE (off)))
3089 off = op0;
3090 continue;
3092 if (TYPE_PRECISION (TREE_TYPE (op0))
3093 < TYPE_PRECISION (TREE_TYPE (off)))
3095 off = op0;
3096 offtype = TREE_TYPE (off);
3097 STRIP_NOPS (off);
3098 continue;
3100 break;
3101 default:
3102 break;
3104 break;
3107 /* If at the end OFF still isn't a SSA_NAME or isn't
3108 defined in the loop, punt. */
3109 if (TREE_CODE (off) != SSA_NAME
3110 || expr_invariant_in_loop_p (loop, off))
3111 return NULL_TREE;
3113 if (offtype == NULL_TREE)
3114 offtype = TREE_TYPE (off);
3116 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3117 offtype, scale);
3118 if (decl == NULL_TREE)
3119 return NULL_TREE;
3121 if (basep)
3122 *basep = base;
3123 if (offp)
3124 *offp = off;
3125 if (scalep)
3126 *scalep = scale;
3127 return decl;
3130 /* Function vect_analyze_data_refs.
3132 Find all the data references in the loop or basic block.
3134 The general structure of the analysis of data refs in the vectorizer is as
3135 follows:
3136 1- vect_analyze_data_refs(loop/bb): call
3137 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3138 in the loop/bb and their dependences.
3139 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3140 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3141 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3145 bool
3146 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3147 bb_vec_info bb_vinfo,
3148 int *min_vf)
3150 struct loop *loop = NULL;
3151 basic_block bb = NULL;
3152 unsigned int i;
3153 vec<data_reference_p> datarefs;
3154 struct data_reference *dr;
3155 tree scalar_type;
3157 if (dump_enabled_p ())
3158 dump_printf_loc (MSG_NOTE, vect_location,
3159 "=== vect_analyze_data_refs ===\n");
3161 if (loop_vinfo)
3163 loop = LOOP_VINFO_LOOP (loop_vinfo);
3164 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
3165 || find_data_references_in_loop
3166 (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
3168 if (dump_enabled_p ())
3169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3170 "not vectorized: loop contains function calls"
3171 " or data references that cannot be analyzed\n");
3172 return false;
3175 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3177 else
3179 gimple_stmt_iterator gsi;
3181 bb = BB_VINFO_BB (bb_vinfo);
3182 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3184 gimple stmt = gsi_stmt (gsi);
3185 if (!find_data_references_in_stmt (NULL, stmt,
3186 &BB_VINFO_DATAREFS (bb_vinfo)))
3188 /* Mark the rest of the basic-block as unvectorizable. */
3189 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3191 stmt = gsi_stmt (gsi);
3192 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3194 break;
3198 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3201 /* Go through the data-refs, check that the analysis succeeded. Update
3202 pointer from stmt_vec_info struct to DR and vectype. */
3204 FOR_EACH_VEC_ELT (datarefs, i, dr)
3206 gimple stmt;
3207 stmt_vec_info stmt_info;
3208 tree base, offset, init;
3209 bool gather = false;
3210 bool simd_lane_access = false;
3211 int vf;
3213 again:
3214 if (!dr || !DR_REF (dr))
3216 if (dump_enabled_p ())
3217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3218 "not vectorized: unhandled data-ref\n");
3219 return false;
3222 stmt = DR_STMT (dr);
3223 stmt_info = vinfo_for_stmt (stmt);
3225 /* Discard clobbers from the dataref vector. We will remove
3226 clobber stmts during vectorization. */
3227 if (gimple_clobber_p (stmt))
3229 if (i == datarefs.length () - 1)
3231 datarefs.pop ();
3232 break;
3234 datarefs[i] = datarefs.pop ();
3235 goto again;
3238 /* Check that analysis of the data-ref succeeded. */
3239 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3240 || !DR_STEP (dr))
3242 bool maybe_gather
3243 = DR_IS_READ (dr)
3244 && !TREE_THIS_VOLATILE (DR_REF (dr))
3245 && targetm.vectorize.builtin_gather != NULL;
3246 bool maybe_simd_lane_access
3247 = loop_vinfo && loop->simduid;
3249 /* If the target supports vector gather loads, or if this might be
3250 a SIMD lane access, see whether they can be used. */
3251 if (loop_vinfo
3252 && (maybe_gather || maybe_simd_lane_access)
3253 && !nested_in_vect_loop_p (loop, stmt))
3255 struct data_reference *newdr
3256 = create_data_ref (NULL, loop_containing_stmt (stmt),
3257 DR_REF (dr), stmt, true);
3258 gcc_assert (newdr != NULL && DR_REF (newdr));
3259 if (DR_BASE_ADDRESS (newdr)
3260 && DR_OFFSET (newdr)
3261 && DR_INIT (newdr)
3262 && DR_STEP (newdr)
3263 && integer_zerop (DR_STEP (newdr)))
3265 if (maybe_simd_lane_access)
3267 tree off = DR_OFFSET (newdr);
3268 STRIP_NOPS (off);
3269 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3270 && TREE_CODE (off) == MULT_EXPR
3271 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3273 tree step = TREE_OPERAND (off, 1);
3274 off = TREE_OPERAND (off, 0);
3275 STRIP_NOPS (off);
3276 if (CONVERT_EXPR_P (off)
3277 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3278 0)))
3279 < TYPE_PRECISION (TREE_TYPE (off)))
3280 off = TREE_OPERAND (off, 0);
3281 if (TREE_CODE (off) == SSA_NAME)
3283 gimple def = SSA_NAME_DEF_STMT (off);
3284 tree reft = TREE_TYPE (DR_REF (newdr));
3285 if (gimple_call_internal_p (def)
3286 && gimple_call_internal_fn (def)
3287 == IFN_GOMP_SIMD_LANE)
3289 tree arg = gimple_call_arg (def, 0);
3290 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3291 arg = SSA_NAME_VAR (arg);
3292 if (arg == loop->simduid
3293 /* For now. */
3294 && tree_int_cst_equal
3295 (TYPE_SIZE_UNIT (reft),
3296 step))
3298 DR_OFFSET (newdr) = ssize_int (0);
3299 DR_STEP (newdr) = step;
3300 DR_ALIGNED_TO (newdr)
3301 = size_int (BIGGEST_ALIGNMENT);
3302 dr = newdr;
3303 simd_lane_access = true;
3309 if (!simd_lane_access && maybe_gather)
3311 dr = newdr;
3312 gather = true;
3315 if (!gather && !simd_lane_access)
3316 free_data_ref (newdr);
3319 if (!gather && !simd_lane_access)
3321 if (dump_enabled_p ())
3323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3324 "not vectorized: data ref analysis "
3325 "failed ");
3326 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3327 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3330 if (bb_vinfo)
3331 break;
3333 return false;
3337 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3339 if (dump_enabled_p ())
3340 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3341 "not vectorized: base addr of dr is a "
3342 "constant\n");
3344 if (bb_vinfo)
3345 break;
3347 if (gather || simd_lane_access)
3348 free_data_ref (dr);
3349 return false;
3352 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3354 if (dump_enabled_p ())
3356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3357 "not vectorized: volatile type ");
3358 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3359 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3362 if (bb_vinfo)
3363 break;
3365 return false;
3368 if (stmt_can_throw_internal (stmt))
3370 if (dump_enabled_p ())
3372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3373 "not vectorized: statement can throw an "
3374 "exception ");
3375 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3376 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3379 if (bb_vinfo)
3380 break;
3382 if (gather || simd_lane_access)
3383 free_data_ref (dr);
3384 return false;
3387 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3388 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3390 if (dump_enabled_p ())
3392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3393 "not vectorized: statement is bitfield "
3394 "access ");
3395 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3396 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3399 if (bb_vinfo)
3400 break;
3402 if (gather || simd_lane_access)
3403 free_data_ref (dr);
3404 return false;
3407 base = unshare_expr (DR_BASE_ADDRESS (dr));
3408 offset = unshare_expr (DR_OFFSET (dr));
3409 init = unshare_expr (DR_INIT (dr));
3411 if (is_gimple_call (stmt))
3413 if (dump_enabled_p ())
3415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3416 "not vectorized: dr in a call ");
3417 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3418 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3421 if (bb_vinfo)
3422 break;
3424 if (gather || simd_lane_access)
3425 free_data_ref (dr);
3426 return false;
3429 /* Update DR field in stmt_vec_info struct. */
3431 /* If the dataref is in an inner-loop of the loop that is considered
3432 for vectorization, we also want to analyze the access relative to
3433 the outer-loop (DR contains information only relative to the
3434 inner-most enclosing loop). We do that by building a reference to the
3435 first location accessed by the inner-loop, and analyze it relative to
3436 the outer-loop. */
3437 if (loop && nested_in_vect_loop_p (loop, stmt))
3439 tree outer_step, outer_base, outer_init;
3440 HOST_WIDE_INT pbitsize, pbitpos;
3441 tree poffset;
3442 enum machine_mode pmode;
3443 int punsignedp, pvolatilep;
3444 affine_iv base_iv, offset_iv;
3445 tree dinit;
3447 /* Build a reference to the first location accessed by the
3448 inner-loop: *(BASE+INIT). (The first location is actually
3449 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3450 tree inner_base = build_fold_indirect_ref
3451 (fold_build_pointer_plus (base, init));
3453 if (dump_enabled_p ())
3455 dump_printf_loc (MSG_NOTE, vect_location,
3456 "analyze in outer-loop: ");
3457 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3458 dump_printf (MSG_NOTE, "\n");
3461 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3462 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3463 gcc_assert (outer_base != NULL_TREE);
3465 if (pbitpos % BITS_PER_UNIT != 0)
3467 if (dump_enabled_p ())
3468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3469 "failed: bit offset alignment.\n");
3470 return false;
3473 outer_base = build_fold_addr_expr (outer_base);
3474 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3475 &base_iv, false))
3477 if (dump_enabled_p ())
3478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3479 "failed: evolution of base is not affine.\n");
3480 return false;
3483 if (offset)
3485 if (poffset)
3486 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3487 poffset);
3488 else
3489 poffset = offset;
3492 if (!poffset)
3494 offset_iv.base = ssize_int (0);
3495 offset_iv.step = ssize_int (0);
3497 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3498 &offset_iv, false))
3500 if (dump_enabled_p ())
3501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3502 "evolution of offset is not affine.\n");
3503 return false;
3506 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3507 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3508 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3509 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3510 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3512 outer_step = size_binop (PLUS_EXPR,
3513 fold_convert (ssizetype, base_iv.step),
3514 fold_convert (ssizetype, offset_iv.step));
3516 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3517 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3518 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3519 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3520 STMT_VINFO_DR_OFFSET (stmt_info) =
3521 fold_convert (ssizetype, offset_iv.base);
3522 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3523 size_int (highest_pow2_factor (offset_iv.base));
3525 if (dump_enabled_p ())
3527 dump_printf_loc (MSG_NOTE, vect_location,
3528 "\touter base_address: ");
3529 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3530 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3531 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3532 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3533 STMT_VINFO_DR_OFFSET (stmt_info));
3534 dump_printf (MSG_NOTE,
3535 "\n\touter constant offset from base address: ");
3536 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3537 STMT_VINFO_DR_INIT (stmt_info));
3538 dump_printf (MSG_NOTE, "\n\touter step: ");
3539 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3540 STMT_VINFO_DR_STEP (stmt_info));
3541 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3542 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3543 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3544 dump_printf (MSG_NOTE, "\n");
3548 if (STMT_VINFO_DATA_REF (stmt_info))
3550 if (dump_enabled_p ())
3552 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3553 "not vectorized: more than one data ref "
3554 "in stmt: ");
3555 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3556 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3559 if (bb_vinfo)
3560 break;
3562 if (gather || simd_lane_access)
3563 free_data_ref (dr);
3564 return false;
3567 STMT_VINFO_DATA_REF (stmt_info) = dr;
3568 if (simd_lane_access)
3570 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3571 datarefs[i] = dr;
3574 /* Set vectype for STMT. */
3575 scalar_type = TREE_TYPE (DR_REF (dr));
3576 STMT_VINFO_VECTYPE (stmt_info) =
3577 get_vectype_for_scalar_type (scalar_type);
3578 if (!STMT_VINFO_VECTYPE (stmt_info))
3580 if (dump_enabled_p ())
3582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3583 "not vectorized: no vectype for stmt: ");
3584 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3585 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3586 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3587 scalar_type);
3588 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3591 if (bb_vinfo)
3592 break;
3594 if (gather || simd_lane_access)
3596 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3597 free_data_ref (dr);
3599 return false;
3601 else
3603 if (dump_enabled_p ())
3605 dump_printf_loc (MSG_NOTE, vect_location,
3606 "got vectype for stmt: ");
3607 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3608 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3609 STMT_VINFO_VECTYPE (stmt_info));
3610 dump_printf (MSG_NOTE, "\n");
3614 /* Adjust the minimal vectorization factor according to the
3615 vector type. */
3616 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3617 if (vf > *min_vf)
3618 *min_vf = vf;
3620 if (gather)
3622 tree off;
3624 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3625 if (gather
3626 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3627 gather = false;
3628 if (!gather)
3630 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3631 free_data_ref (dr);
3632 if (dump_enabled_p ())
3634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3635 "not vectorized: not suitable for gather "
3636 "load ");
3637 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3638 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3640 return false;
3643 datarefs[i] = dr;
3644 STMT_VINFO_GATHER_P (stmt_info) = true;
3646 else if (loop_vinfo
3647 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3649 if (nested_in_vect_loop_p (loop, stmt)
3650 || !DR_IS_READ (dr))
3652 if (dump_enabled_p ())
3654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3655 "not vectorized: not suitable for strided "
3656 "load ");
3657 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3658 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3660 return false;
3662 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3666 /* If we stopped analysis at the first dataref we could not analyze
3667 when trying to vectorize a basic-block, mark the rest of the datarefs
3668 as not vectorizable and truncate the vector of datarefs. That
3669 avoids spending useless time in analyzing their dependence. */
3670 if (i != datarefs.length ())
3672 gcc_assert (bb_vinfo != NULL);
3673 for (unsigned j = i; j < datarefs.length (); ++j)
3675 data_reference_p dr = datarefs[j];
3676 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3677 free_data_ref (dr);
3679 datarefs.truncate (i);
3682 return true;
3686 /* Function vect_get_new_vect_var.
3688 Returns a name for a new variable. The current naming scheme prepends
3689 the prefix "vect_", "stmp_" or "vectp_" (depending on the value of
3690 VAR_KIND) to NAME when NAME is provided, forming the name of a
3691 vectorizer-generated variable. */
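/* For instance (hypothetical names), a pointer variable created for a
   dataref named "a" gets the base name "vectp_a", while an unnamed one is
   simply "vectp".  */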
3693 tree
3694 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3696 const char *prefix;
3697 tree new_vect_var;
3699 switch (var_kind)
3701 case vect_simple_var:
3702 prefix = "vect";
3703 break;
3704 case vect_scalar_var:
3705 prefix = "stmp";
3706 break;
3707 case vect_pointer_var:
3708 prefix = "vectp";
3709 break;
3710 default:
3711 gcc_unreachable ();
3714 if (name)
3716 char* tmp = concat (prefix, "_", name, NULL);
3717 new_vect_var = create_tmp_reg (type, tmp);
3718 free (tmp);
3720 else
3721 new_vect_var = create_tmp_reg (type, prefix);
3723 return new_vect_var;
3727 /* Function vect_create_addr_base_for_vector_ref.
3729 Create an expression that computes the address of the first memory location
3730 that will be accessed for a data reference.
3732 Input:
3733 STMT: The statement containing the data reference.
3734 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3735 OFFSET: Optional. If supplied, it is to be added to the initial address.
3736 LOOP: Specify relative to which loop-nest the address should be computed.
3737 For example, when the dataref is in an inner-loop nested in an
3738 outer-loop that is now being vectorized, LOOP can be either the
3739 outer-loop, or the inner-loop. The first memory location accessed
3740 by the following dataref ('in' points to short):
3742 for (i=0; i<N; i++)
3743 for (j=0; j<M; j++)
3744 s += in[i+j]
3746 is as follows:
3747 if LOOP=i_loop: &in (relative to i_loop)
3748 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3750 Output:
3751 1. Return an SSA_NAME whose value is the address of the memory location of
3752 the first vector of the data reference.
3753 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3754 these statement(s) which define the returned SSA_NAME.
3756 FORNOW: We are only handling array accesses with step 1. */
3758 tree
3759 vect_create_addr_base_for_vector_ref (gimple stmt,
3760 gimple_seq *new_stmt_list,
3761 tree offset,
3762 struct loop *loop)
3764 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3765 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3766 tree data_ref_base;
3767 const char *base_name;
3768 tree addr_base;
3769 tree dest;
3770 gimple_seq seq = NULL;
3771 tree base_offset;
3772 tree init;
3773 tree vect_ptr_type;
3774 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3775 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3777 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3779 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3781 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3783 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3784 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3785 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3787 else
3789 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3790 base_offset = unshare_expr (DR_OFFSET (dr));
3791 init = unshare_expr (DR_INIT (dr));
3794 if (loop_vinfo)
3795 base_name = get_name (data_ref_base);
3796 else
3798 base_offset = ssize_int (0);
3799 init = ssize_int (0);
3800 base_name = get_name (DR_REF (dr));
3803 /* Create base_offset */
3804 base_offset = size_binop (PLUS_EXPR,
3805 fold_convert (sizetype, base_offset),
3806 fold_convert (sizetype, init));
3808 if (offset)
3810 offset = fold_build2 (MULT_EXPR, sizetype,
3811 fold_convert (sizetype, offset), step);
3812 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3813 base_offset, offset);
3816 /* base + base_offset */
3817 if (loop_vinfo)
3818 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3819 else
3821 addr_base = build1 (ADDR_EXPR,
3822 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3823 unshare_expr (DR_REF (dr)));
3826 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3827 addr_base = fold_convert (vect_ptr_type, addr_base);
3828 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3829 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3830 gimple_seq_add_seq (new_stmt_list, seq);
3832 if (DR_PTR_INFO (dr)
3833 && TREE_CODE (addr_base) == SSA_NAME)
3835 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3836 if (offset)
3837 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3840 if (dump_enabled_p ())
3842 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3843 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3844 dump_printf (MSG_NOTE, "\n");
3847 return addr_base;
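/* A worked instance of the arithmetic above (a sketch with made-up values,
   not actual generated output): for a dataref on 'short *in' with
   DR_BASE_ADDRESS = &in[0], DR_OFFSET = 0, DR_INIT = 0, step = 2 and an
   OFFSET argument of 4 (in elements), the statements built are roughly

     base_offset = (sizetype) 0 + (sizetype) 0;    // DR_OFFSET + DR_INIT
     base_offset = base_offset + (sizetype) 4 * 2; // + OFFSET * step == 8
     vectp_in    = &in[0] + base_offset;           // POINTER_PLUS_EXPR

   so the returned SSA_NAME evaluates to &in[4].  */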
3851 /* Function vect_create_data_ref_ptr.
3853 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3854 location accessed in the loop by STMT, along with the def-use update
3855 chain to appropriately advance the pointer through the loop iterations.
3856 Also set aliasing information for the pointer. This pointer is used by
3857 the callers to this function to create a memory reference expression for
3858 vector load/store access.
3860 Input:
3861 1. STMT: a stmt that references memory. Expected to be of the form
3862 GIMPLE_ASSIGN <name, data-ref> or
3863 GIMPLE_ASSIGN <data-ref, name>.
3864 2. AGGR_TYPE: the type of the reference, which should be either a vector
3865 or an array.
3866 3. AT_LOOP: the loop where the vector memref is to be created.
3867 4. OFFSET (optional): an offset to be added to the initial address accessed
3868 by the data-ref in STMT.
3869 5. BSI: location where the new stmts are to be placed if there is no loop
3870 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3871 pointing to the initial address.
3873 Output:
3874 1. Declare a new ptr to vector_type, and have it point to the base of the
3875 data reference (initial address accessed by the data reference).
3876 For example, for vector of type V8HI, the following code is generated:
3878 v8hi *ap;
3879 ap = (v8hi *)initial_address;
3881 if OFFSET is not supplied:
3882 initial_address = &a[init];
3883 if OFFSET is supplied:
3884 initial_address = &a[init + OFFSET];
3886 Return the initial_address in INITIAL_ADDRESS.
3888 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3889 update the pointer in each iteration of the loop.
3891 Return the increment stmt that updates the pointer in PTR_INCR.
3893 3. Set INV_P to true if the access pattern of the data reference in the
3894 vectorized loop is invariant. Set it to false otherwise.
3896 4. Return the pointer. */
3898 tree
3899 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3900 tree offset, tree *initial_address,
3901 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3902 bool only_init, bool *inv_p)
3904 const char *base_name;
3905 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3906 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3907 struct loop *loop = NULL;
3908 bool nested_in_vect_loop = false;
3909 struct loop *containing_loop = NULL;
3910 tree aggr_ptr_type;
3911 tree aggr_ptr;
3912 tree new_temp;
3913 gimple vec_stmt;
3914 gimple_seq new_stmt_list = NULL;
3915 edge pe = NULL;
3916 basic_block new_bb;
3917 tree aggr_ptr_init;
3918 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3919 tree aptr;
3920 gimple_stmt_iterator incr_gsi;
3921 bool insert_after;
3922 tree indx_before_incr, indx_after_incr;
3923 gimple incr;
3924 tree step;
3925 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3927 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3928 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3930 if (loop_vinfo)
3932 loop = LOOP_VINFO_LOOP (loop_vinfo);
3933 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3934 containing_loop = (gimple_bb (stmt))->loop_father;
3935 pe = loop_preheader_edge (loop);
3937 else
3939 gcc_assert (bb_vinfo);
3940 only_init = true;
3941 *ptr_incr = NULL;
3944 /* Check the step (evolution) of the load in LOOP, and record
3945 whether it's invariant. */
3946 if (nested_in_vect_loop)
3947 step = STMT_VINFO_DR_STEP (stmt_info);
3948 else
3949 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3951 if (integer_zerop (step))
3952 *inv_p = true;
3953 else
3954 *inv_p = false;
3956 /* Create an expression for the first address accessed by this load
3957 in LOOP. */
3958 base_name = get_name (DR_BASE_ADDRESS (dr));
3960 if (dump_enabled_p ())
3962 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3963 dump_printf_loc (MSG_NOTE, vect_location,
3964 "create %s-pointer variable to type: ",
3965 get_tree_code_name (TREE_CODE (aggr_type)));
3966 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3967 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3968 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3969 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
3970 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
3971 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3972 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3973 else
3974 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3975 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3976 dump_printf (MSG_NOTE, "\n");
3979 /* (1) Create the new aggregate-pointer variable.
3980 Vector and array types inherit the alias set of their component
3981 type by default so we need to use a ref-all pointer if the data
3982 reference does not conflict with the created aggregated data
3983 reference because it is not addressable. */
3984 bool need_ref_all = false;
3985 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3986 get_alias_set (DR_REF (dr))))
3987 need_ref_all = true;
3988 /* Likewise for any of the data references in the stmt group. */
3989 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3991 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3994 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
3995 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
3996 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3997 get_alias_set (DR_REF (sdr))))
3999 need_ref_all = true;
4000 break;
4002 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4004 while (orig_stmt);
4006 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4007 need_ref_all);
4008 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4011 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4012 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4013 def-use update cycles for the pointer: one relative to the outer-loop
4014 (LOOP), which is what steps (2) and (3) below do. The other is relative
4015 to the inner-loop (which is the inner-most loop containing the dataref),
4016 and this is done by step (4) below.
4018 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4019 inner-most loop, and so steps (2),(3) work the same, and step (4) is
4020 redundant. Steps (2),(3) create the following:
4022 vp0 = &base_addr;
4023 LOOP: vp1 = phi(vp0,vp2)
4026 vp2 = vp1 + step
4027 goto LOOP
4029 If there is an inner-loop nested in LOOP, then step (4) will also be
4030 applied, and an additional update in the inner-loop will be created:
4032 vp0 = &base_addr;
4033 LOOP: vp1 = phi(vp0,vp2)
4035 inner: vp3 = phi(vp1,vp4)
4036 vp4 = vp3 + inner_step
4037 if () goto inner
4039 vp2 = vp1 + step
4040 if () goto LOOP */
4042 /* (2) Calculate the initial address of the aggregate-pointer, and set
4043 the aggregate-pointer to point to it before the loop. */
4045 /* Create: &(base[init_val+offset]) in the loop preheader. */
4047 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4048 offset, loop);
4049 if (new_stmt_list)
4051 if (pe)
4053 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4054 gcc_assert (!new_bb);
4056 else
4057 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4060 *initial_address = new_temp;
4062 /* Create: p = (aggr_type *) initial_base */
4063 if (TREE_CODE (new_temp) != SSA_NAME
4064 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4066 vec_stmt = gimple_build_assign (aggr_ptr,
4067 fold_convert (aggr_ptr_type, new_temp));
4068 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4069 /* Copy the points-to information if it exists. */
4070 if (DR_PTR_INFO (dr))
4071 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4072 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4073 if (pe)
4075 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4076 gcc_assert (!new_bb);
4078 else
4079 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4081 else
4082 aggr_ptr_init = new_temp;
4084 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4085 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4086 inner-loop nested in LOOP (during outer-loop vectorization). */
4088 /* No update in loop is required. */
4089 if (only_init && (!loop_vinfo || at_loop == loop))
4090 aptr = aggr_ptr_init;
4091 else
4093 /* The step of the aggregate pointer is the type size. */
4094 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4095 /* One exception to the above is when the scalar step of the load in
4096 LOOP is zero. In this case the step here is also zero. */
4097 if (*inv_p)
4098 iv_step = size_zero_node;
4099 else if (tree_int_cst_sgn (step) == -1)
4100 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4102 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4104 create_iv (aggr_ptr_init,
4105 fold_convert (aggr_ptr_type, iv_step),
4106 aggr_ptr, loop, &incr_gsi, insert_after,
4107 &indx_before_incr, &indx_after_incr);
4108 incr = gsi_stmt (incr_gsi);
4109 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4111 /* Copy the points-to information if it exists. */
4112 if (DR_PTR_INFO (dr))
4114 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4115 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4117 if (ptr_incr)
4118 *ptr_incr = incr;
4120 aptr = indx_before_incr;
4123 if (!nested_in_vect_loop || only_init)
4124 return aptr;
4127 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4128 nested in LOOP, if it exists. */
4130 gcc_assert (nested_in_vect_loop);
4131 if (!only_init)
4133 standard_iv_increment_position (containing_loop, &incr_gsi,
4134 &insert_after);
4135 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4136 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4137 &indx_after_incr);
4138 incr = gsi_stmt (incr_gsi);
4139 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4141 /* Copy the points-to information if it exists. */
4142 if (DR_PTR_INFO (dr))
4144 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4145 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4147 if (ptr_incr)
4148 *ptr_incr = incr;
4150 return indx_before_incr;
4152 else
4153 gcc_unreachable ();
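/* As a scalar-code analogue of the pointer update cycle created in step (3)
   above (a sketch only, assuming a V8HI aggregate so iv_step is 16 bytes):

     char *vp = (char *) initial_address;   // vp0, set in the preheader
     for (;;)                               // LOOP
       {
         // ... loads/stores through vp (vp1 = phi (vp0, vp2)) ...
         vp += 16;                          // vp2 = vp1 + step
       }

   For outer-loop vectorization, step (4) adds a second increment by DR_STEP
   inside the inner loop in the same way.  */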
4157 /* Function bump_vector_ptr
4159 Increment a pointer (to a vector type) by vector-size. If requested,
4160 i.e. if PTR-INCR is given, then also connect the new increment stmt
4161 to the existing def-use update-chain of the pointer, by modifying
4162 the PTR_INCR as illustrated below:
4164 The pointer def-use update-chain before this function:
4165 DATAREF_PTR = phi (p_0, p_2)
4166 ....
4167 PTR_INCR: p_2 = DATAREF_PTR + step
4169 The pointer def-use update-chain after this function:
4170 DATAREF_PTR = phi (p_0, p_2)
4171 ....
4172 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4173 ....
4174 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4176 Input:
4177 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4178 in the loop.
4179 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4180 the loop. The increment amount across iterations is expected
4181 to be vector_size.
4182 BSI - location where the new update stmt is to be placed.
4183 STMT - the original scalar memory-access stmt that is being vectorized.
4184 BUMP - optional. The offset by which to bump the pointer. If not given,
4185 the offset is assumed to be vector_size.
4187 Output: Return NEW_DATAREF_PTR as illustrated above.
4191 tree
4192 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4193 gimple stmt, tree bump)
4195 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4196 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4197 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4198 tree update = TYPE_SIZE_UNIT (vectype);
4199 gimple incr_stmt;
4200 ssa_op_iter iter;
4201 use_operand_p use_p;
4202 tree new_dataref_ptr;
4204 if (bump)
4205 update = bump;
4207 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4208 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4209 dataref_ptr, update);
4210 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4212 /* Copy the points-to information if it exists. */
4213 if (DR_PTR_INFO (dr))
4215 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4216 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4219 if (!ptr_incr)
4220 return new_dataref_ptr;
4222 /* Update the vector-pointer's cross-iteration increment. */
4223 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4225 tree use = USE_FROM_PTR (use_p);
4227 if (use == dataref_ptr)
4228 SET_USE (use_p, new_dataref_ptr);
4229 else
4230 gcc_assert (tree_int_cst_compare (use, update) == 0);
4233 return new_dataref_ptr;
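/* For example (a sketch of the effect, with made-up SSA names): for a V4SI
   dataref TYPE_SIZE_UNIT (vectype) is 16, so with no BUMP argument the new
   POINTER_PLUS_EXPR statement is roughly

     vectp.7_23 = vectp.7_20 + 16;

   and the use of vectp.7_20 in PTR_INCR is rewired to vectp.7_23, keeping
   the cross-iteration increment chain shown above intact.  */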
4237 /* Function vect_create_destination_var.
4239 Create a new temporary of type VECTYPE. */
4241 tree
4242 vect_create_destination_var (tree scalar_dest, tree vectype)
4244 tree vec_dest;
4245 const char *name;
4246 char *new_name;
4247 tree type;
4248 enum vect_var_kind kind;
4250 kind = vectype ? vect_simple_var : vect_scalar_var;
4251 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4253 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4255 name = get_name (scalar_dest);
4256 if (name)
4257 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4258 else
4259 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4260 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4261 free (new_name);
4263 return vec_dest;
4266 /* Function vect_grouped_store_supported.
4268 Returns TRUE if interleave high and interleave low permutations
4269 are supported, and FALSE otherwise. */
4271 bool
4272 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4274 enum machine_mode mode = TYPE_MODE (vectype);
4276 /* vect_permute_store_chain requires the group size to be a power of two. */
4277 if (exact_log2 (count) == -1)
4279 if (dump_enabled_p ())
4280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4281 "the size of the group of accesses"
4282 " is not a power of 2\n");
4283 return false;
4286 /* Check that the permutation is supported. */
4287 if (VECTOR_MODE_P (mode))
4289 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4290 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4291 for (i = 0; i < nelt / 2; i++)
4293 sel[i * 2] = i;
4294 sel[i * 2 + 1] = i + nelt;
4296 if (can_vec_perm_p (mode, false, sel))
4298 for (i = 0; i < nelt; i++)
4299 sel[i] += nelt / 2;
4300 if (can_vec_perm_p (mode, false, sel))
4301 return true;
4305 if (dump_enabled_p ())
4306 dump_printf (MSG_MISSED_OPTIMIZATION,
4307 "interleave op not supported by target.\n");
4308 return false;
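/* A worked instance of the mask construction above, for nelt == 4 (these are
   the values the two loops compute, not extra generated code):

     sel after the first loop       : {0, 4, 1, 5}   // interleave-high mask
     sel after "sel[i] += nelt / 2" : {2, 6, 3, 7}   // interleave-low mask

   which are exactly the 0 4 1 5 / 2 6 3 7 patterns described before
   vect_permute_store_chain below.  */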
4312 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4313 type VECTYPE. */
4315 bool
4316 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4318 return vect_lanes_optab_supported_p ("vec_store_lanes",
4319 vec_store_lanes_optab,
4320 vectype, count);
4324 /* Function vect_permute_store_chain.
4326 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4327 a power of 2, generate interleave_high/low stmts to reorder the data
4328 correctly for the stores. Return the final references for stores in
4329 RESULT_CHAIN.
4331 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4332 The input is 4 vectors, each containing 8 elements. We assign a number to
4333 each element; the input sequence is:
4335 1st vec: 0 1 2 3 4 5 6 7
4336 2nd vec: 8 9 10 11 12 13 14 15
4337 3rd vec: 16 17 18 19 20 21 22 23
4338 4th vec: 24 25 26 27 28 29 30 31
4340 The output sequence should be:
4342 1st vec: 0 8 16 24 1 9 17 25
4343 2nd vec: 2 10 18 26 3 11 19 27
4344 3rd vec: 4 12 20 28 5 13 21 29
4345 4th vec: 6 14 22 30 7 15 23 31
4347 i.e., we interleave the contents of the four vectors in their order.
4349 We use interleave_high/low instructions to create such output. The input of
4350 each interleave_high/low operation is two vectors:
4351 1st vec 2nd vec
4352 0 1 2 3 4 5 6 7
4353 the even elements of the result vector are obtained left-to-right from the
4354 high/low elements of the first vector. The odd elements of the result are
4355 obtained left-to-right from the high/low elements of the second vector.
4356 The output of interleave_high will be: 0 4 1 5
4357 and of interleave_low: 2 6 3 7
4360 The permutation is done in log LENGTH stages. In each stage interleave_high
4361 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4362 where the first argument is taken from the first half of DR_CHAIN and the
4363 second argument from its second half.
4364 In our example,
4366 I1: interleave_high (1st vec, 3rd vec)
4367 I2: interleave_low (1st vec, 3rd vec)
4368 I3: interleave_high (2nd vec, 4th vec)
4369 I4: interleave_low (2nd vec, 4th vec)
4371 The output for the first stage is:
4373 I1: 0 16 1 17 2 18 3 19
4374 I2: 4 20 5 21 6 22 7 23
4375 I3: 8 24 9 25 10 26 11 27
4376 I4: 12 28 13 29 14 30 15 31
4378 The output of the second stage, i.e. the final result is:
4380 I1: 0 8 16 24 1 9 17 25
4381 I2: 2 10 18 26 3 11 19 27
4382 I3: 4 12 20 28 5 13 21 29
4383 I4: 6 14 22 30 7 15 23 31. */
4385 void
4386 vect_permute_store_chain (vec<tree> dr_chain,
4387 unsigned int length,
4388 gimple stmt,
4389 gimple_stmt_iterator *gsi,
4390 vec<tree> *result_chain)
4392 tree vect1, vect2, high, low;
4393 gimple perm_stmt;
4394 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4395 tree perm_mask_low, perm_mask_high;
4396 unsigned int i, n;
4397 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4398 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4400 result_chain->quick_grow (length);
4401 memcpy (result_chain->address (), dr_chain.address (),
4402 length * sizeof (tree));
4404 for (i = 0, n = nelt / 2; i < n; i++)
4406 sel[i * 2] = i;
4407 sel[i * 2 + 1] = i + nelt;
4409 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4410 gcc_assert (perm_mask_high != NULL);
4412 for (i = 0; i < nelt; i++)
4413 sel[i] += nelt / 2;
4414 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4415 gcc_assert (perm_mask_low != NULL);
4417 for (i = 0, n = exact_log2 (length); i < n; i++)
4419 for (j = 0; j < length/2; j++)
4421 vect1 = dr_chain[j];
4422 vect2 = dr_chain[j+length/2];
4424 /* Create interleaving stmt:
4425 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4426 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4427 perm_stmt
4428 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4429 vect1, vect2, perm_mask_high);
4430 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4431 (*result_chain)[2*j] = high;
4433 /* Create interleaving stmt:
4434 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4435 nelt*3/2+1, ...}> */
4436 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4437 perm_stmt
4438 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4439 vect1, vect2, perm_mask_low);
4440 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4441 (*result_chain)[2*j+1] = low;
4443 memcpy (dr_chain.address (), result_chain->address (),
4444 length * sizeof (tree));
4448 /* Function vect_setup_realignment
4450 This function is called when vectorizing an unaligned load using
4451 the dr_explicit_realign[_optimized] scheme.
4452 This function generates the following code at the loop prolog:
4454 p = initial_addr;
4455 x msq_init = *(floor(p)); # prolog load
4456 realignment_token = call target_builtin;
4457 loop:
4458 x msq = phi (msq_init, ---)
4460 The stmts marked with x are generated only for the case of
4461 dr_explicit_realign_optimized.
4463 The code above sets up a new (vector) pointer, pointing to the first
4464 location accessed by STMT, and a "floor-aligned" load using that pointer.
4465 It also generates code to compute the "realignment-token" (if the relevant
4466 target hook was defined), and creates a phi-node at the loop-header bb
4467 whose arguments are the result of the prolog-load (created by this
4468 function) and the result of a load that takes place in the loop (to be
4469 created by the caller to this function).
4471 For the case of dr_explicit_realign_optimized:
4472 The caller to this function uses the phi-result (msq) to create the
4473 realignment code inside the loop, and sets up the missing phi argument,
4474 as follows:
4475 loop:
4476 msq = phi (msq_init, lsq)
4477 lsq = *(floor(p')); # load in loop
4478 result = realign_load (msq, lsq, realignment_token);
4480 For the case of dr_explicit_realign:
4481 loop:
4482 msq = *(floor(p)); # load in loop
4483 p' = p + (VS-1);
4484 lsq = *(floor(p')); # load in loop
4485 result = realign_load (msq, lsq, realignment_token);
4487 Input:
4488 STMT - (scalar) load stmt to be vectorized. This load accesses
4489 a memory location that may be unaligned.
4490 BSI - place where new code is to be inserted.
4491 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4492 is used.
4494 Output:
4495 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4496 target hook, if defined.
4497 Return value - the result of the loop-header phi node. */
4499 tree
4500 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4501 tree *realignment_token,
4502 enum dr_alignment_support alignment_support_scheme,
4503 tree init_addr,
4504 struct loop **at_loop)
4506 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4507 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4508 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4509 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4510 struct loop *loop = NULL;
4511 edge pe = NULL;
4512 tree scalar_dest = gimple_assign_lhs (stmt);
4513 tree vec_dest;
4514 gimple inc;
4515 tree ptr;
4516 tree data_ref;
4517 gimple new_stmt;
4518 basic_block new_bb;
4519 tree msq_init = NULL_TREE;
4520 tree new_temp;
4521 gimple phi_stmt;
4522 tree msq = NULL_TREE;
4523 gimple_seq stmts = NULL;
4524 bool inv_p;
4525 bool compute_in_loop = false;
4526 bool nested_in_vect_loop = false;
4527 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4528 struct loop *loop_for_initial_load = NULL;
4530 if (loop_vinfo)
4532 loop = LOOP_VINFO_LOOP (loop_vinfo);
4533 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4536 gcc_assert (alignment_support_scheme == dr_explicit_realign
4537 || alignment_support_scheme == dr_explicit_realign_optimized);
4539 /* We need to generate three things:
4540 1. the misalignment computation
4541 2. the extra vector load (for the optimized realignment scheme).
4542 3. the phi node for the two vectors from which the realignment is
4543 done (for the optimized realignment scheme). */
4545 /* 1. Determine where to generate the misalignment computation.
4547 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4548 calculation will be generated by this function, outside the loop (in the
4549 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4550 caller, inside the loop.
4552 Background: If the misalignment remains fixed throughout the iterations of
4553 the loop, then both realignment schemes are applicable, and also the
4554 misalignment computation can be done outside LOOP. This is because we are
4555 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4556 are a multiple of VS (the Vector Size), and therefore the misalignment in
4557 different vectorized LOOP iterations is always the same.
4558 The problem arises only if the memory access is in an inner-loop nested
4559 inside LOOP, which is now being vectorized using outer-loop vectorization.
4560 This is the only case when the misalignment of the memory access may not
4561 remain fixed throughout the iterations of the inner-loop (as explained in
4562 detail in vect_supportable_dr_alignment). In this case, not only is the
4563 optimized realignment scheme not applicable, but also the misalignment
4564 computation (and generation of the realignment token that is passed to
4565 REALIGN_LOAD) have to be done inside the loop.
4567 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4568 or not, which in turn determines if the misalignment is computed inside
4569 the inner-loop, or outside LOOP. */
4571 if (init_addr != NULL_TREE || !loop_vinfo)
4573 compute_in_loop = true;
4574 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4578 /* 2. Determine where to generate the extra vector load.
4580 For the optimized realignment scheme, instead of generating two vector
4581 loads in each iteration, we generate a single extra vector load in the
4582 preheader of the loop, and in each iteration reuse the result of the
4583 vector load from the previous iteration. In case the memory access is in
4584 an inner-loop nested inside LOOP, which is now being vectorized using
4585 outer-loop vectorization, we need to determine whether this initial vector
4586 load should be generated at the preheader of the inner-loop, or can be
4587 generated at the preheader of LOOP. If the memory access has no evolution
4588 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4589 to be generated inside LOOP (in the preheader of the inner-loop). */
4591 if (nested_in_vect_loop)
4593 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4594 bool invariant_in_outerloop =
4595 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4596 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4598 else
4599 loop_for_initial_load = loop;
4600 if (at_loop)
4601 *at_loop = loop_for_initial_load;
4603 if (loop_for_initial_load)
4604 pe = loop_preheader_edge (loop_for_initial_load);
4606 /* 3. For the case of the optimized realignment, create the first vector
4607 load at the loop preheader. */
4609 if (alignment_support_scheme == dr_explicit_realign_optimized)
4611 /* Create msq_init = *(floor(p1)) in the loop preheader */
4613 gcc_assert (!compute_in_loop);
4614 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4615 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4616 NULL_TREE, &init_addr, NULL, &inc,
4617 true, &inv_p);
4618 new_temp = copy_ssa_name (ptr, NULL);
4619 new_stmt = gimple_build_assign_with_ops
4620 (BIT_AND_EXPR, new_temp, ptr,
4621 build_int_cst (TREE_TYPE (ptr),
4622 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4623 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4624 gcc_assert (!new_bb);
4625 data_ref
4626 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4627 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4628 new_stmt = gimple_build_assign (vec_dest, data_ref);
4629 new_temp = make_ssa_name (vec_dest, new_stmt);
4630 gimple_assign_set_lhs (new_stmt, new_temp);
4631 if (pe)
4633 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4634 gcc_assert (!new_bb);
4636 else
4637 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4639 msq_init = gimple_assign_lhs (new_stmt);
4642 /* 4. Create realignment token using a target builtin, if available.
4643 It is done either inside the containing loop, or before LOOP (as
4644 determined above). */
4646 if (targetm.vectorize.builtin_mask_for_load)
4648 tree builtin_decl;
4650 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4651 if (!init_addr)
4653 /* Generate the INIT_ADDR computation outside LOOP. */
4654 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4655 NULL_TREE, loop);
4656 if (loop)
4658 pe = loop_preheader_edge (loop);
4659 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4660 gcc_assert (!new_bb);
4662 else
4663 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4666 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4667 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4668 vec_dest =
4669 vect_create_destination_var (scalar_dest,
4670 gimple_call_return_type (new_stmt));
4671 new_temp = make_ssa_name (vec_dest, new_stmt);
4672 gimple_call_set_lhs (new_stmt, new_temp);
4674 if (compute_in_loop)
4675 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4676 else
4678 /* Generate the misalignment computation outside LOOP. */
4679 pe = loop_preheader_edge (loop);
4680 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4681 gcc_assert (!new_bb);
4684 *realignment_token = gimple_call_lhs (new_stmt);
4686 /* The result of the CALL_EXPR to this builtin is determined from
4687 the value of the parameter and no global variables are touched
4688 which makes the builtin a "const" function. Requiring the
4689 builtin to have the "const" attribute makes it unnecessary
4690 to call mark_call_clobbered. */
4691 gcc_assert (TREE_READONLY (builtin_decl));
4694 if (alignment_support_scheme == dr_explicit_realign)
4695 return msq;
4697 gcc_assert (!compute_in_loop);
4698 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4701 /* 5. Create msq = phi <msq_init, lsq> in loop */
4703 pe = loop_preheader_edge (containing_loop);
4704 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4705 msq = make_ssa_name (vec_dest, NULL);
4706 phi_stmt = create_phi_node (msq, containing_loop->header);
4707 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4709 return msq;
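/* The "floor (p)" used above is the usual clear-the-low-bits trick; the
   BIT_AND_EXPR with -TYPE_ALIGN_UNIT (vectype) rounds the address down to
   the previous vector boundary. A minimal standalone sketch, assuming
   16-byte vectors (not GCC code):

     #include <stdint.h>

     static inline void *
     floor_align_16 (void *p)
     {
       // -16 is ~15 in two's complement, so this keeps the address and
       // clears its four low bits.
       return (void *) ((uintptr_t) p & (uintptr_t) -16);
     }
*/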
4713 /* Function vect_grouped_load_supported.
4715 Returns TRUE if even and odd permutations are supported,
4716 and FALSE otherwise. */
4718 bool
4719 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4721 enum machine_mode mode = TYPE_MODE (vectype);
4723 /* vect_permute_load_chain requires the group size to be a power of two. */
4724 if (exact_log2 (count) == -1)
4726 if (dump_enabled_p ())
4727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4728 "the size of the group of accesses"
4729 " is not a power of 2\n");
4730 return false;
4733 /* Check that the permutation is supported. */
4734 if (VECTOR_MODE_P (mode))
4736 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4737 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4739 for (i = 0; i < nelt; i++)
4740 sel[i] = i * 2;
4741 if (can_vec_perm_p (mode, false, sel))
4743 for (i = 0; i < nelt; i++)
4744 sel[i] = i * 2 + 1;
4745 if (can_vec_perm_p (mode, false, sel))
4746 return true;
4750 if (dump_enabled_p ())
4751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4752 "extract even/odd not supported by target\n");
4753 return false;
4756 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4757 type VECTYPE. */
4759 bool
4760 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4762 return vect_lanes_optab_supported_p ("vec_load_lanes",
4763 vec_load_lanes_optab,
4764 vectype, count);
4767 /* Function vect_permute_load_chain.
4769 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4770 a power of 2, generate extract_even/odd stmts to reorder the input data
4771 correctly. Return the final references for loads in RESULT_CHAIN.
4773 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4774 The input is 4 vectors, each containing 8 elements. We assign a number to each
4775 element; the input sequence is:
4777 1st vec: 0 1 2 3 4 5 6 7
4778 2nd vec: 8 9 10 11 12 13 14 15
4779 3rd vec: 16 17 18 19 20 21 22 23
4780 4th vec: 24 25 26 27 28 29 30 31
4782 The output sequence should be:
4784 1st vec: 0 4 8 12 16 20 24 28
4785 2nd vec: 1 5 9 13 17 21 25 29
4786 3rd vec: 2 6 10 14 18 22 26 30
4787 4th vec: 3 7 11 15 19 23 27 31
4789 i.e., the first output vector should contain the first elements of each
4790 interleaving group, etc.
4792 We use extract_even/odd instructions to create such output. The input of
4793 each extract_even/odd operation is two vectors
4794 1st vec 2nd vec
4795 0 1 2 3 4 5 6 7
4797 and the output is the vector of extracted even/odd elements. The output of
4798 extract_even will be: 0 2 4 6
4799 and of extract_odd: 1 3 5 7
4802 The permutation is done in log LENGTH stages. In each stage extract_even
4803 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4804 their order. In our example,
4806 E1: extract_even (1st vec, 2nd vec)
4807 E2: extract_odd (1st vec, 2nd vec)
4808 E3: extract_even (3rd vec, 4th vec)
4809 E4: extract_odd (3rd vec, 4th vec)
4811 The output for the first stage will be:
4813 E1: 0 2 4 6 8 10 12 14
4814 E2: 1 3 5 7 9 11 13 15
4815 E3: 16 18 20 22 24 26 28 30
4816 E4: 17 19 21 23 25 27 29 31
4818 In order to proceed and create the correct sequence for the next stage (or
4819 for the correct output, if the second stage is the last one, as in our
4820 example), we first put the output of extract_even operation and then the
4821 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4822 The input for the second stage is:
4824 1st vec (E1): 0 2 4 6 8 10 12 14
4825 2nd vec (E3): 16 18 20 22 24 26 28 30
4826 3rd vec (E2): 1 3 5 7 9 11 13 15
4827 4th vec (E4): 17 19 21 23 25 27 29 31
4829 The output of the second stage:
4831 E1: 0 4 8 12 16 20 24 28
4832 E2: 2 6 10 14 18 22 26 30
4833 E3: 1 5 9 13 17 21 25 29
4834 E4: 3 7 11 15 19 23 27 31
4836 And RESULT_CHAIN after reordering:
4838 1st vec (E1): 0 4 8 12 16 20 24 28
4839 2nd vec (E3): 1 5 9 13 17 21 25 29
4840 3rd vec (E2): 2 6 10 14 18 22 26 30
4841 4th vec (E4): 3 7 11 15 19 23 27 31. */
4843 static void
4844 vect_permute_load_chain (vec<tree> dr_chain,
4845 unsigned int length,
4846 gimple stmt,
4847 gimple_stmt_iterator *gsi,
4848 vec<tree> *result_chain)
4850 tree data_ref, first_vect, second_vect;
4851 tree perm_mask_even, perm_mask_odd;
4852 gimple perm_stmt;
4853 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4854 unsigned int i, j, log_length = exact_log2 (length);
4855 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4856 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4858 result_chain->quick_grow (length);
4859 memcpy (result_chain->address (), dr_chain.address (),
4860 length * sizeof (tree));
4862 for (i = 0; i < nelt; ++i)
4863 sel[i] = i * 2;
4864 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4865 gcc_assert (perm_mask_even != NULL);
4867 for (i = 0; i < nelt; ++i)
4868 sel[i] = i * 2 + 1;
4869 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4870 gcc_assert (perm_mask_odd != NULL);
4872 for (i = 0; i < log_length; i++)
4874 for (j = 0; j < length; j += 2)
4876 first_vect = dr_chain[j];
4877 second_vect = dr_chain[j+1];
4879 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4880 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4881 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4882 first_vect, second_vect,
4883 perm_mask_even);
4884 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4885 (*result_chain)[j/2] = data_ref;
4887 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4888 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4889 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4890 first_vect, second_vect,
4891 perm_mask_odd);
4892 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4893 (*result_chain)[j/2+length/2] = data_ref;
4895 memcpy (dr_chain.address (), result_chain->address (),
4896 length * sizeof (tree));
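/* A standalone sketch that replays the two extract_even/odd stages on the
   4 x 8 example documented above (plain C, for illustration only; it is not
   part of the vectorizer):

     #include <stdio.h>
     #include <string.h>

     int
     main (void)
     {
       enum { LEN = 4, NELT = 8 };
       int chain[LEN][NELT], result[LEN][NELT];

       for (int v = 0; v < LEN; v++)
         for (int e = 0; e < NELT; e++)
           chain[v][e] = v * NELT + e;            // elements 0 .. 31

       for (int stage = 0; stage < 2; stage++)    // log2 (LEN) stages
         {
           for (int j = 0; j < LEN; j += 2)
             for (int i = 0; i < NELT; i++)
               {
                 // Index the concatenation of chain[j] and chain[j+1]
                 // at positions 2*i (even) and 2*i + 1 (odd).
                 int e = 2 * i, o = 2 * i + 1;
                 result[j / 2][i]
                   = e < NELT ? chain[j][e] : chain[j + 1][e - NELT];
                 result[j / 2 + LEN / 2][i]
                   = o < NELT ? chain[j][o] : chain[j + 1][o - NELT];
               }
           memcpy (chain, result, sizeof chain);
         }

       // Prints 0 4 8 ... 28, then 1 5 9 ... 29, 2 6 ... 30, 3 7 ... 31,
       // i.e. the RESULT_CHAIN order shown in the comment above.
       for (int v = 0; v < LEN; v++)
         {
           for (int e = 0; e < NELT; e++)
             printf ("%2d ", chain[v][e]);
           printf ("\n");
         }
       return 0;
     }
*/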
4901 /* Function vect_transform_grouped_load.
4903 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4904 to perform their permutation and ascribe the result vectorized statements to
4905 the scalar statements.
4908 void
4909 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4910 gimple_stmt_iterator *gsi)
4912 vec<tree> result_chain = vNULL;
4914 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4915 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4916 vectors, that are ready for vector computation. */
4917 result_chain.create (size);
4918 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4919 vect_record_grouped_load_vectors (stmt, result_chain);
4920 result_chain.release ();
4923 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4924 generated as part of the vectorization of STMT. Assign the statement
4925 for each vector to the associated scalar statement. */
4927 void
4928 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4930 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4931 gimple next_stmt, new_stmt;
4932 unsigned int i, gap_count;
4933 tree tmp_data_ref;
4935 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4936 Since we scan the chain starting from its first node, their order
4937 corresponds to the order of data-refs in RESULT_CHAIN. */
4938 next_stmt = first_stmt;
4939 gap_count = 1;
4940 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4942 if (!next_stmt)
4943 break;
4945 /* Skip the gaps. Loads created for the gaps will be removed by the dead
4946 code elimination pass later. No need to check for the first stmt in
4947 the group, since it always exists.
4948 GROUP_GAP is the number of steps in elements from the previous
4949 access (if there is no gap GROUP_GAP is 1). We skip loads that
4950 correspond to the gaps. */
4951 if (next_stmt != first_stmt
4952 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4954 gap_count++;
4955 continue;
4958 while (next_stmt)
4960 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4961 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4962 copies, and we put the new vector statement in the first available
4963 RELATED_STMT. */
4964 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4965 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4966 else
4968 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4970 gimple prev_stmt =
4971 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4972 gimple rel_stmt =
4973 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4974 while (rel_stmt)
4976 prev_stmt = rel_stmt;
4977 rel_stmt =
4978 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4981 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4982 new_stmt;
4986 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4987 gap_count = 1;
4988 /* If NEXT_STMT accesses the same DR as the previous statement,
4989 put the same TMP_DATA_REF as its vectorized statement; otherwise
4990 get the next data-ref from RESULT_CHAIN. */
4991 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4992 break;
4997 /* Function vect_can_force_dr_alignment_p.
4999 Returns whether the alignment of a DECL can be forced to be aligned
5000 on an ALIGNMENT-bit boundary.
5002 bool
5003 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5005 if (TREE_CODE (decl) != VAR_DECL)
5006 return false;
5008 /* We cannot change alignment of common or external symbols as another
5009 translation unit may contain a definition with lower alignment.
5010 The rules of common symbol linking mean that the definition
5011 will override the common symbol. The same is true for constant
5012 pool entries which may be shared and are not properly merged
5013 by LTO. */
5014 if (DECL_EXTERNAL (decl)
5015 || DECL_COMMON (decl)
5016 || DECL_IN_CONSTANT_POOL (decl))
5017 return false;
5019 if (TREE_ASM_WRITTEN (decl))
5020 return false;
5022 /* Do not override the alignment as specified by the ABI when the used
5023 attribute is set. */
5024 if (DECL_PRESERVE_P (decl))
5025 return false;
5027 /* Do not override explicit alignment set by the user when an explicit
5028 section name is also used. This is a common idiom used by many
5029 software projects. */
5030 if (DECL_SECTION_NAME (decl) != NULL_TREE
5031 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
5032 return false;
5034 if (TREE_STATIC (decl))
5035 return (alignment <= MAX_OFILE_ALIGNMENT);
5036 else
5037 return (alignment <= MAX_STACK_ALIGNMENT);
5041 /* Return whether the data reference DR is supported with respect to its
5042 alignment.
5043 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5044 if it is aligned, i.e., check if it is possible to vectorize it with different
5045 alignment. */
5047 enum dr_alignment_support
5048 vect_supportable_dr_alignment (struct data_reference *dr,
5049 bool check_aligned_accesses)
5051 gimple stmt = DR_STMT (dr);
5052 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5053 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5054 enum machine_mode mode = TYPE_MODE (vectype);
5055 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5056 struct loop *vect_loop = NULL;
5057 bool nested_in_vect_loop = false;
5059 if (aligned_access_p (dr) && !check_aligned_accesses)
5060 return dr_aligned;
5062 if (loop_vinfo)
5064 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5065 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5068 /* Possibly unaligned access. */
5070 /* We can choose between using the implicit realignment scheme (generating
5071 a misaligned_move stmt) and the explicit realignment scheme (generating
5072 aligned loads with a REALIGN_LOAD). There are two variants to the
5073 explicit realignment scheme: optimized, and unoptimized.
5074 We can optimize the realignment only if the step between consecutive
5075 vector loads is equal to the vector size. Since the vector memory
5076 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5077 is guaranteed that the misalignment amount remains the same throughout the
5078 execution of the vectorized loop. Therefore, we can create the
5079 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5080 at the loop preheader.
5082 However, in the case of outer-loop vectorization, when vectorizing a
5083 memory access in the inner-loop nested within the LOOP that is now being
5084 vectorized, while it is guaranteed that the misalignment of the
5085 vectorized memory access will remain the same in different outer-loop
5086 iterations, it is *not* guaranteed that it will remain the same throughout
5087 the execution of the inner-loop. This is because the inner-loop advances
5088 with the original scalar step (and not in steps of VS). If the inner-loop
5089 step happens to be a multiple of VS, then the misalignment remains fixed
5090 and we can use the optimized realignment scheme. For example:
5092 for (i=0; i<N; i++)
5093 for (j=0; j<M; j++)
5094 s += a[i+j];
5096 When vectorizing the i-loop in the above example, the step between
5097 consecutive vector loads is 1, and so the misalignment does not remain
5098 fixed across the execution of the inner-loop, and the realignment cannot
5099 be optimized (as illustrated in the following pseudo vectorized loop):
5101 for (i=0; i<N; i+=4)
5102 for (j=0; j<M; j++){
5103 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5104 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5105 // (assuming that we start from an aligned address).
5108 We therefore have to use the unoptimized realignment scheme:
5110 for (i=0; i<N; i+=4)
5111 for (j=k; j<M; j+=4)
5112 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5113 // that the misalignment of the initial address is
5114 // 0).
5116 The loop can then be vectorized as follows:
5118 for (k=0; k<4; k++){
5119 rt = get_realignment_token (&vp[k]);
5120 for (i=0; i<N; i+=4){
5121 v1 = vp[i+k];
5122 for (j=k; j<M; j+=4){
5123 v2 = vp[i+j+VS-1];
5124 va = REALIGN_LOAD <v1,v2,rt>;
5125 vs += va;
5126 v1 = v2;
5129 } */
5131 if (DR_IS_READ (dr))
5133 bool is_packed = false;
5134 tree type = (TREE_TYPE (DR_REF (dr)));
5136 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5137 && (!targetm.vectorize.builtin_mask_for_load
5138 || targetm.vectorize.builtin_mask_for_load ()))
5140 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5141 if ((nested_in_vect_loop
5142 && (TREE_INT_CST_LOW (DR_STEP (dr))
5143 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5144 || !loop_vinfo)
5145 return dr_explicit_realign;
5146 else
5147 return dr_explicit_realign_optimized;
5149 if (!known_alignment_for_access_p (dr))
5150 is_packed = not_size_aligned (DR_REF (dr));
5152 if ((TYPE_USER_ALIGN (type) && !is_packed)
5153 || targetm.vectorize.
5154 support_vector_misalignment (mode, type,
5155 DR_MISALIGNMENT (dr), is_packed))
5156 /* Can't software pipeline the loads, but can at least do them. */
5157 return dr_unaligned_supported;
5159 else
5161 bool is_packed = false;
5162 tree type = (TREE_TYPE (DR_REF (dr)));
5164 if (!known_alignment_for_access_p (dr))
5165 is_packed = not_size_aligned (DR_REF (dr));
5167 if ((TYPE_USER_ALIGN (type) && !is_packed)
5168 || targetm.vectorize.
5169 support_vector_misalignment (mode, type,
5170 DR_MISALIGNMENT (dr), is_packed))
5171 return dr_unaligned_supported;
5174 /* Unsupported. */
5175 return dr_unaligned_unsupported;