gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "stor-layout.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-ssa-alias.h"
34 #include "internal-fn.h"
35 #include "tree-eh.h"
36 #include "gimple-expr.h"
37 #include "is-a.h"
38 #include "gimple.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "gimple-ssa.h"
43 #include "tree-phinodes.h"
44 #include "ssa-iterators.h"
45 #include "stringpool.h"
46 #include "tree-ssanames.h"
47 #include "tree-ssa-loop-ivopts.h"
48 #include "tree-ssa-loop-manip.h"
49 #include "tree-ssa-loop.h"
51 #include "cfgloop.h"
52 #include "tree-chrec.h"
53 #include "tree-scalar-evolution.h"
54 #include "tree-vectorizer.h"
55 #include "diagnostic-core.h"
56 /* Need to include rtl.h, expr.h, etc. for optabs. */
57 #include "expr.h"
58 #include "optabs.h"
60 /* Return true if load- or store-lanes optab OPTAB is implemented for
61 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
63 static bool
64 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
65 tree vectype, unsigned HOST_WIDE_INT count)
67 enum machine_mode mode, array_mode;
68 bool limit_p;
70 mode = TYPE_MODE (vectype);
71 limit_p = !targetm.array_mode_supported_p (mode, count);
72 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
73 MODE_INT, limit_p);
75 if (array_mode == BLKmode)
77 if (dump_enabled_p ())
78 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
79 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
80 GET_MODE_NAME (mode), count);
81 return false;
84 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
88 "cannot use %s<%s><%s>\n", name,
89 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
90 return false;
93 if (dump_enabled_p ())
94 dump_printf_loc (MSG_NOTE, vect_location,
95 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
96 GET_MODE_NAME (mode));
98 return true;
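/* A worked illustration (a minimal sketch, not taken from this file): with
   VECTYPE = V4SI (a 128-bit vector) and COUNT = 2 the routine looks for a
   256-bit integer mode to serve as the array mode, roughly

       mode       = V4SImode;                                (128 bits)
       array_mode = mode_for_size (2 * 128, MODE_INT, limit_p);

   and then asks, via convert_optab_handler, whether the target provides

       vec_load_lanes<OI><V4SI>    and    vec_store_lanes<OI><V4SI>

   Targets such as ARM/AArch64, whose ld2/st2 instruction family implements
   these optabs, typically also accept the array mode through
   array_mode_supported_p, so LIMIT_P is false and the oversized integer
   mode is allowed.  */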
102 /* Return the smallest scalar part of STMT.
103 This is used to determine the vectype of the stmt. We generally set the
104 vectype according to the type of the result (lhs). For stmts whose
105 result-type is different than the type of the arguments (e.g., demotion,
106 promotion), vectype will be reset appropriately (later). Note that we have
107 to visit the smallest datatype in this function, because that determines the
108 VF. If the smallest datatype in the loop is present only as the rhs of a
109 promotion operation, we would miss it.
110 Such a case, where a variable of this datatype does not appear in the lhs
111 anywhere in the loop, can only occur if it's an invariant: e.g.:
112 'int_x = (int) short_inv', which we'd expect to have been optimized away by
113 invariant motion. However, we cannot rely on invariant motion to always
114 take invariants out of the loop, and so in the case of promotion we also
115 have to check the rhs.
116 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
117 types. */
119 tree
120 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
123 tree scalar_type = gimple_expr_type (stmt);
124 HOST_WIDE_INT lhs, rhs;
126 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
128 if (is_gimple_assign (stmt)
129 && (gimple_assign_cast_p (stmt)
130 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
131 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
132 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
134 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
136 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
137 if (rhs < lhs)
138 scalar_type = rhs_type;
141 *lhs_size_unit = lhs;
142 *rhs_size_unit = rhs;
143 return scalar_type;
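/* For illustration, a minimal sketch of the case described above, where the
   smallest scalar type occurs only on the rhs of a promotion:

       short *s;  int *d;
       for (i = 0; i < n; i++)
         d[i] = (int) s[i];

   The lhs type is int, but the vectorization factor must be derived from
   the narrower short on the rhs, which is why casts and widening
   operations are inspected here.  */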
147 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
148 tested at run-time. Return TRUE if DDR was successfully inserted.
149 Return false if versioning is not supported. */
151 static bool
152 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
154 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
156 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
157 return false;
159 if (dump_enabled_p ())
161 dump_printf_loc (MSG_NOTE, vect_location,
162 "mark for run-time aliasing test between ");
163 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
164 dump_printf (MSG_NOTE, " and ");
165 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
166 dump_printf (MSG_NOTE, "\n");
169 if (optimize_loop_nest_for_size_p (loop))
171 if (dump_enabled_p ())
172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
173 "versioning not supported when optimizing"
174 " for size.\n");
175 return false;
178 /* FORNOW: We don't support versioning with outer-loop vectorization. */
179 if (loop->inner)
181 if (dump_enabled_p ())
182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
183 "versioning not yet supported for outer-loops.\n");
184 return false;
187 /* FORNOW: We don't support creating runtime alias tests for non-constant
188 step. */
189 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
190 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
192 if (dump_enabled_p ())
193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
194 "versioning not yet supported for non-constant "
195 "step\n");
196 return false;
199 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
200 return true;
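/* The DDRs recorded here are later turned into a run-time alias check that
   guards a versioned copy of the loop.  Roughly (a sketch only; the actual
   expression is built elsewhere from DR_BASE_ADDRESS, DR_INIT and DR_STEP):

       if (a + segment_length_a <= b || b + segment_length_b <= a)
         ... vectorized loop ...
       else
         ... original scalar loop ...

   where each segment length covers the bytes the data-ref touches during
   the iterations executed by the vector loop.  */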
204 /* Function vect_analyze_data_ref_dependence.
206 Return TRUE if there (might) exist a dependence between a memory-reference
207 DRA and a memory-reference DRB. If the dependence can instead be checked
208 at run-time by versioning for alias, return FALSE. Adjust *MAX_VF
209 according to the data dependence. */
211 static bool
212 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
213 loop_vec_info loop_vinfo, int *max_vf)
215 unsigned int i;
216 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
217 struct data_reference *dra = DDR_A (ddr);
218 struct data_reference *drb = DDR_B (ddr);
219 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
220 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
221 lambda_vector dist_v;
222 unsigned int loop_depth;
224 /* In loop analysis all data references should be vectorizable. */
225 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
226 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
227 gcc_unreachable ();
229 /* Independent data accesses. */
230 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
231 return false;
233 if (dra == drb
234 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
235 return false;
237 /* Unknown data dependence. */
238 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
240 /* If user asserted safelen consecutive iterations can be
241 executed concurrently, assume independence. */
242 if (loop->safelen >= 2)
244 if (loop->safelen < *max_vf)
245 *max_vf = loop->safelen;
246 return false;
249 if (STMT_VINFO_GATHER_P (stmtinfo_a)
250 || STMT_VINFO_GATHER_P (stmtinfo_b))
252 if (dump_enabled_p ())
254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
255 "versioning for alias not supported for: "
256 "can't determine dependence between ");
257 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
258 DR_REF (dra));
259 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
260 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
261 DR_REF (drb));
262 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
264 return true;
267 if (dump_enabled_p ())
269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
270 "versioning for alias required: "
271 "can't determine dependence between ");
272 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
273 DR_REF (dra));
274 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
275 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
276 DR_REF (drb));
277 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
280 /* Add to list of ddrs that need to be tested at run-time. */
281 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
284 /* Known data dependence. */
285 if (DDR_NUM_DIST_VECTS (ddr) == 0)
287 /* If user asserted safelen consecutive iterations can be
288 executed concurrently, assume independence. */
289 if (loop->safelen >= 2)
291 if (loop->safelen < *max_vf)
292 *max_vf = loop->safelen;
293 return false;
296 if (STMT_VINFO_GATHER_P (stmtinfo_a)
297 || STMT_VINFO_GATHER_P (stmtinfo_b))
299 if (dump_enabled_p ())
301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
302 "versioning for alias not supported for: "
303 "bad dist vector for ");
304 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
305 DR_REF (dra));
306 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
307 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
308 DR_REF (drb));
309 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
311 return true;
314 if (dump_enabled_p ())
316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
317 "versioning for alias required: "
318 "bad dist vector for ");
319 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
320 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
321 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
322 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
324 /* Add to list of ddrs that need to be tested at run-time. */
325 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
328 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
329 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
331 int dist = dist_v[loop_depth];
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location,
335 "dependence distance = %d.\n", dist);
337 if (dist == 0)
339 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE, vect_location,
342 "dependence distance == 0 between ");
343 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
344 dump_printf (MSG_NOTE, " and ");
345 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
346 dump_printf (MSG_NOTE, "\n");
349 /* When we perform grouped accesses and perform implicit CSE
350 by detecting equal accesses and doing disambiguation with
351 runtime alias tests like for
352 .. = a[i];
353 .. = a[i+1];
354 a[i] = ..;
355 a[i+1] = ..;
356 *p = ..;
357 .. = a[i];
358 .. = a[i+1];
359 where we will end up loading { a[i], a[i+1] } once, make
360 sure that inserting group loads before the first load and
361 stores after the last store will do the right thing. */
362 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
363 && GROUP_SAME_DR_STMT (stmtinfo_a))
364 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
365 && GROUP_SAME_DR_STMT (stmtinfo_b)))
367 gimple earlier_stmt;
368 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
369 if (DR_IS_WRITE
370 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
372 if (dump_enabled_p ())
373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
374 "READ_WRITE dependence in interleaving."
375 "\n");
376 return true;
380 continue;
383 if (dist > 0 && DDR_REVERSED_P (ddr))
385 /* If DDR_REVERSED_P the order of the data-refs in DDR was
386 reversed (to make distance vector positive), and the actual
387 distance is negative. */
388 if (dump_enabled_p ())
389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
390 "dependence distance negative.\n");
391 continue;
394 if (abs (dist) >= 2
395 && abs (dist) < *max_vf)
397 /* The dependence distance requires reduction of the maximal
398 vectorization factor. */
399 *max_vf = abs (dist);
400 if (dump_enabled_p ())
401 dump_printf_loc (MSG_NOTE, vect_location,
402 "adjusting maximal vectorization factor to %i\n",
403 *max_vf);
406 if (abs (dist) >= *max_vf)
408 /* Dependence distance does not create dependence, as far as
409 vectorization is concerned, in this case. */
410 if (dump_enabled_p ())
411 dump_printf_loc (MSG_NOTE, vect_location,
412 "dependence distance >= VF.\n");
413 continue;
416 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419 "not vectorized, possible dependence "
420 "between data-refs ");
421 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
422 dump_printf (MSG_NOTE, " and ");
423 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
424 dump_printf (MSG_NOTE, "\n");
427 return true;
430 return false;
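/* A minimal example of the distance handling above:

       for (i = 0; i < n; i++)
         a[i + 3] = a[i] + 1;

   has a dependence distance of 3, so executing at most 3 iterations at
   once is still correct and *MAX_VF is clamped to 3.  A distance of 0
   (same location within one iteration) and a distance >= *MAX_VF are
   harmless, while a distance of 1 forbids vectorization altogether.  */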
433 /* Function vect_analyze_data_ref_dependences.
435 Examine all the data references in the loop, and make sure there do not
436 exist any data dependences between them. Set *MAX_VF according to
437 the maximum vectorization factor the data dependences allow. */
439 bool
440 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
442 unsigned int i;
443 struct data_dependence_relation *ddr;
445 if (dump_enabled_p ())
446 dump_printf_loc (MSG_NOTE, vect_location,
447 "=== vect_analyze_data_ref_dependences ===\n");
449 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
450 &LOOP_VINFO_DDRS (loop_vinfo),
451 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
452 return false;
454 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
455 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
456 return false;
458 return true;
462 /* Function vect_slp_analyze_data_ref_dependence.
464 Return TRUE if there (might) exist a dependence between a memory-reference
465 DRA and a memory-reference DRB that prohibits basic-block (SLP)
466 vectorization. Unlike the loop variant, no run-time alias versioning
467 or *MAX_VF adjustment is done here. */
469 static bool
470 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
472 struct data_reference *dra = DDR_A (ddr);
473 struct data_reference *drb = DDR_B (ddr);
475 /* We need to check dependences of statements marked as unvectorizable
476 as well, they still can prohibit vectorization. */
478 /* Independent data accesses. */
479 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
480 return false;
482 if (dra == drb)
483 return false;
485 /* Read-read is OK. */
486 if (DR_IS_READ (dra) && DR_IS_READ (drb))
487 return false;
489 /* If dra and drb are part of the same interleaving chain consider
490 them independent. */
491 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
492 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
493 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
494 return false;
496 /* Unknown data dependence. */
497 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
499 gimple earlier_stmt;
501 if (dump_enabled_p ())
503 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
504 "can't determine dependence between ");
505 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
506 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
507 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
508 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
511 /* We do not vectorize basic blocks with write-write dependencies. */
512 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
513 return true;
515 /* Check that it's not a load-after-store dependence. */
516 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
517 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
518 return true;
520 return false;
523 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location,
526 "determined dependence between ");
527 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
528 dump_printf (MSG_NOTE, " and ");
529 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
530 dump_printf (MSG_NOTE, "\n");
533 /* Do not vectorize basic blocks with write-write dependences. */
534 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
535 return true;
537 /* Check dependence between DRA and DRB for basic block vectorization.
538 If the accesses share same bases and offsets, we can compare their initial
539 constant offsets to decide whether they differ or not. In case of a read-
540 write dependence we check that the load is before the store to ensure that
541 vectorization will not change the order of the accesses. */
543 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
544 gimple earlier_stmt;
546 /* Check that the data-refs have same bases and offsets. If not, we can't
547 determine if they are dependent. */
548 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
549 || !dr_equal_offsets_p (dra, drb))
550 return true;
552 /* Check the types. */
553 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
554 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
556 if (type_size_a != type_size_b
557 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
558 TREE_TYPE (DR_REF (drb))))
559 return true;
561 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
562 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
564 /* Two different locations - no dependence. */
565 if (init_a != init_b)
566 return false;
568 /* We have a read-write dependence. Check that the load is before the store.
569 When we vectorize basic blocks, a vector load can only be placed before the
570 corresponding scalar load, and a vector store can only be placed after its
571 corresponding scalar store. So the order of the accesses is preserved in
572 case the load is before the store. */
573 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
574 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
575 return false;
577 return true;
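/* A minimal example for the basic-block case above: given

       x    = a[4];     (load,  DR_INIT == 16 with 4-byte elements)
       a[4] = y;        (store, DR_INIT == 16)

   both refs share base and offset and have equal DR_INIT, so this is a
   read-write dependence on a single location; it is accepted only because
   the load comes first, an order that vectorization preserves.  With
   different DR_INITs (say a[4] and a[5]) the refs denote distinct
   locations and no dependence is recorded.  */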
581 /* Function vect_slp_analyze_data_ref_dependences.
583 Examine all the data references in the basic-block, and make sure there
584 do not exist any data dependences between them that would prevent
585 basic-block (SLP) vectorization. */
587 bool
588 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
590 struct data_dependence_relation *ddr;
591 unsigned int i;
593 if (dump_enabled_p ())
594 dump_printf_loc (MSG_NOTE, vect_location,
595 "=== vect_slp_analyze_data_ref_dependences ===\n");
597 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
598 &BB_VINFO_DDRS (bb_vinfo),
599 vNULL, true))
600 return false;
602 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
603 if (vect_slp_analyze_data_ref_dependence (ddr))
604 return false;
606 return true;
610 /* Function vect_compute_data_ref_alignment
612 Compute the misalignment of the data reference DR.
614 Output:
615 1. If during the misalignment computation it is found that the data reference
616 cannot be vectorized then false is returned.
617 2. DR_MISALIGNMENT (DR) is defined.
619 FOR NOW: No analysis is actually performed. Misalignment is calculated
620 only for trivial cases. TODO. */
622 static bool
623 vect_compute_data_ref_alignment (struct data_reference *dr)
625 gimple stmt = DR_STMT (dr);
626 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
627 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
628 struct loop *loop = NULL;
629 tree ref = DR_REF (dr);
630 tree vectype;
631 tree base, base_addr;
632 bool base_aligned;
633 tree misalign;
634 tree aligned_to, alignment;
636 if (dump_enabled_p ())
637 dump_printf_loc (MSG_NOTE, vect_location,
638 "vect_compute_data_ref_alignment:\n");
640 if (loop_vinfo)
641 loop = LOOP_VINFO_LOOP (loop_vinfo);
643 /* Initialize misalignment to unknown. */
644 SET_DR_MISALIGNMENT (dr, -1);
646 /* Strided loads perform only component accesses, misalignment information
647 is irrelevant for them. */
648 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
649 return true;
651 misalign = DR_INIT (dr);
652 aligned_to = DR_ALIGNED_TO (dr);
653 base_addr = DR_BASE_ADDRESS (dr);
654 vectype = STMT_VINFO_VECTYPE (stmt_info);
656 /* In case the dataref is in an inner-loop of the loop that is being
657 vectorized (LOOP), we use the base and misalignment information
658 relative to the outer-loop (LOOP). This is ok only if the misalignment
659 stays the same throughout the execution of the inner-loop, which is why
660 we have to check that the stride of the dataref in the inner-loop is a
661 multiple of the vector size. */
662 if (loop && nested_in_vect_loop_p (loop, stmt))
664 tree step = DR_STEP (dr);
665 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
667 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
669 if (dump_enabled_p ())
670 dump_printf_loc (MSG_NOTE, vect_location,
671 "inner step divides the vector-size.\n");
672 misalign = STMT_VINFO_DR_INIT (stmt_info);
673 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
674 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
676 else
678 if (dump_enabled_p ())
679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
680 "inner step doesn't divide the vector-size.\n");
681 misalign = NULL_TREE;
685 /* Similarly, if we're doing basic-block vectorization, we can only use
686 base and misalignment information relative to an innermost loop if the
687 misalignment stays the same throughout the execution of the loop.
688 As above, this is the case if the stride of the dataref is a multiple of
689 the vector size. */
690 if (!loop)
692 tree step = DR_STEP (dr);
693 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
695 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
697 if (dump_enabled_p ())
698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
699 "SLP: step doesn't divide the vector-size.\n");
700 misalign = NULL_TREE;
704 base = build_fold_indirect_ref (base_addr);
705 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
707 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
708 || !misalign)
710 if (dump_enabled_p ())
712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
713 "Unknown alignment for access: ");
714 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
715 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
717 return true;
720 if ((DECL_P (base)
721 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
722 alignment) >= 0)
723 || (TREE_CODE (base_addr) == SSA_NAME
724 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
725 TREE_TYPE (base_addr)))),
726 alignment) >= 0)
727 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
728 base_aligned = true;
729 else
730 base_aligned = false;
732 if (!base_aligned)
734 /* Do not change the alignment of global variables here if
735 flag_section_anchors is enabled as we already generated
736 RTL for other functions. Most global variables should
737 have been aligned during the IPA increase_alignment pass. */
738 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
739 || (TREE_STATIC (base) && flag_section_anchors))
741 if (dump_enabled_p ())
743 dump_printf_loc (MSG_NOTE, vect_location,
744 "can't force alignment of ref: ");
745 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
746 dump_printf (MSG_NOTE, "\n");
748 return true;
751 /* Force the alignment of the decl.
752 NOTE: This is the only change to the code we make during
753 the analysis phase, before deciding to vectorize the loop. */
754 if (dump_enabled_p ())
756 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
757 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
758 dump_printf (MSG_NOTE, "\n");
761 ((dataref_aux *)dr->aux)->base_decl = base;
762 ((dataref_aux *)dr->aux)->base_misaligned = true;
765 /* If this is a backward running DR then the first access in the larger
766 vectype actually is N-1 elements before the address in the DR.
767 Adjust misalign accordingly. */
768 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
770 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
771 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
772 otherwise we wouldn't be here. */
773 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
774 /* PLUS because DR_STEP was negative. */
775 misalign = size_binop (PLUS_EXPR, misalign, offset);
778 /* Modulo alignment. */
779 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
781 if (!tree_fits_uhwi_p (misalign))
783 /* Negative or overflowed misalignment value. */
784 if (dump_enabled_p ())
785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
786 "unexpected misalign value\n");
787 return false;
790 SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
792 if (dump_enabled_p ())
794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
795 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
796 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
797 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
800 return true;
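/* A worked example (a minimal sketch): for a 16-byte aligned "double a[N]"
   accessed as a[i+1] with a 16-byte vector type, DR_BASE_ADDRESS is &a and
   DR_INIT is 8, so the computation above yields

       misalign = 8 mod 16   =>   DR_MISALIGNMENT (dr) == 8

   For a backward-running access (negative DR_STEP) the first element read
   through the vector type lies nunits-1 elements before the address in
   the DR, which is why (nunits - 1) * DR_STEP is folded in first.  */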
804 /* Function vect_compute_data_refs_alignment
806 Compute the misalignment of data references in the loop.
807 Return FALSE if a data reference is found that cannot be vectorized. */
809 static bool
810 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
811 bb_vec_info bb_vinfo)
813 vec<data_reference_p> datarefs;
814 struct data_reference *dr;
815 unsigned int i;
817 if (loop_vinfo)
818 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
819 else
820 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
822 FOR_EACH_VEC_ELT (datarefs, i, dr)
823 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
824 && !vect_compute_data_ref_alignment (dr))
826 if (bb_vinfo)
828 /* Mark unsupported statement as unvectorizable. */
829 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
830 continue;
832 else
833 return false;
836 return true;
840 /* Function vect_update_misalignment_for_peel
842 DR - the data reference whose misalignment is to be adjusted.
843 DR_PEEL - the data reference whose misalignment is being made
844 zero in the vector loop by the peel.
845 NPEEL - the number of iterations in the peel loop if the misalignment
846 of DR_PEEL is known at compile time. */
848 static void
849 vect_update_misalignment_for_peel (struct data_reference *dr,
850 struct data_reference *dr_peel, int npeel)
852 unsigned int i;
853 vec<dr_p> same_align_drs;
854 struct data_reference *current_dr;
855 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
856 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
857 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
858 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
860 /* For interleaved data accesses the step in the loop must be multiplied by
861 the size of the interleaving group. */
862 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
863 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
864 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
865 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
867 /* It can be assumed that the data refs with the same alignment as dr_peel
868 are aligned in the vector loop. */
869 same_align_drs
870 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
871 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
873 if (current_dr != dr)
874 continue;
875 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
876 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
877 SET_DR_MISALIGNMENT (dr, 0);
878 return;
881 if (known_alignment_for_access_p (dr)
882 && known_alignment_for_access_p (dr_peel))
884 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
885 int misal = DR_MISALIGNMENT (dr);
886 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
887 misal += negative ? -npeel * dr_size : npeel * dr_size;
888 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
889 SET_DR_MISALIGNMENT (dr, misal);
890 return;
893 if (dump_enabled_p ())
894 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
895 SET_DR_MISALIGNMENT (dr, -1);
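/* A worked example (a minimal sketch): with 16-byte vectors, 4-byte
   elements and NPEEL == 3, a data-ref that was misaligned by 4 bytes
   becomes

       misal = (4 + 3 * 4) & (16 - 1) = 0

   i.e. aligned after the peel, while one misaligned by 8 bytes ends up at
   (8 + 12) & 15 = 4.  When either misalignment is unknown, the result must
   conservatively be reset to -1 (unknown).  */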
899 /* Function vect_verify_datarefs_alignment
901 Return TRUE if all data references in the loop can be
902 handled with respect to alignment. */
904 bool
905 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
907 vec<data_reference_p> datarefs;
908 struct data_reference *dr;
909 enum dr_alignment_support supportable_dr_alignment;
910 unsigned int i;
912 if (loop_vinfo)
913 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
914 else
915 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
917 FOR_EACH_VEC_ELT (datarefs, i, dr)
919 gimple stmt = DR_STMT (dr);
920 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
922 if (!STMT_VINFO_RELEVANT_P (stmt_info))
923 continue;
925 /* For interleaving, only the alignment of the first access matters.
926 Skip statements marked as not vectorizable. */
927 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
928 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
929 || !STMT_VINFO_VECTORIZABLE (stmt_info))
930 continue;
932 /* Strided loads perform only component accesses, alignment is
933 irrelevant for them. */
934 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
935 continue;
937 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
938 if (!supportable_dr_alignment)
940 if (dump_enabled_p ())
942 if (DR_IS_READ (dr))
943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
944 "not vectorized: unsupported unaligned load.");
945 else
946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
947 "not vectorized: unsupported unaligned "
948 "store.");
950 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
951 DR_REF (dr));
952 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
954 return false;
956 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
957 dump_printf_loc (MSG_NOTE, vect_location,
958 "Vectorizing an unaligned access.\n");
960 return true;
963 /* Given a memory reference EXP, return whether its alignment is less
964 than its size. */
966 static bool
967 not_size_aligned (tree exp)
969 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
970 return true;
972 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
973 > get_object_alignment (exp));
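/* Example (a minimal sketch): a 4-byte int member of a struct declared
   with __attribute__((packed)) may be guaranteed only 1-byte alignment;
   its alignment is then smaller than its size, so the access is "not size
   aligned" and is treated as packed by the callers below.  */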
976 /* Function vector_alignment_reachable_p
978 Return true if vector alignment for DR is reachable by peeling
979 a few loop iterations. Return false otherwise. */
981 static bool
982 vector_alignment_reachable_p (struct data_reference *dr)
984 gimple stmt = DR_STMT (dr);
985 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
986 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
988 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
990 /* For interleaved accesses we peel only if the number of iterations in
991 the prolog loop ({VF - misalignment}) is a multiple of the
992 number of interleaved accesses (the group size). */
993 int elem_size, mis_in_elements;
994 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
996 /* FORNOW: handle only known alignment. */
997 if (!known_alignment_for_access_p (dr))
998 return false;
1000 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1001 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1003 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1004 return false;
1007 /* If the misalignment is known at compile time then allow peeling
1008 only if natural alignment is reachable through peeling. */
1009 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1011 HOST_WIDE_INT elmsize =
1012 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1013 if (dump_enabled_p ())
1015 dump_printf_loc (MSG_NOTE, vect_location,
1016 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1017 dump_printf (MSG_NOTE,
1018 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1020 if (DR_MISALIGNMENT (dr) % elmsize)
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1024 "data size does not divide the misalignment.\n");
1025 return false;
1029 if (!known_alignment_for_access_p (dr))
1031 tree type = TREE_TYPE (DR_REF (dr));
1032 bool is_packed = not_size_aligned (DR_REF (dr));
1033 if (dump_enabled_p ())
1034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1035 "Unknown misalignment, is_packed = %d\n",is_packed);
1036 if ((TYPE_USER_ALIGN (type) && !is_packed)
1037 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1038 return true;
1039 else
1040 return false;
1043 return true;
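/* Example (a minimal sketch): a 4-byte int access misaligned by 8 bytes
   against 16-byte vectors becomes aligned after peeling two scalar
   iterations, so alignment is reachable.  If the same access were
   misaligned by 2 bytes, no whole number of 4-byte steps reaches a 16-byte
   boundary (the element size does not divide the misalignment) and the
   function returns false.  */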
1047 /* Calculate the cost of the memory access represented by DR. */
1049 static void
1050 vect_get_data_access_cost (struct data_reference *dr,
1051 unsigned int *inside_cost,
1052 unsigned int *outside_cost,
1053 stmt_vector_for_cost *body_cost_vec)
1055 gimple stmt = DR_STMT (dr);
1056 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1057 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1058 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1059 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1060 int ncopies = vf / nunits;
1062 if (DR_IS_READ (dr))
1063 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1064 NULL, body_cost_vec, false);
1065 else
1066 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1068 if (dump_enabled_p ())
1069 dump_printf_loc (MSG_NOTE, vect_location,
1070 "vect_get_data_access_cost: inside_cost = %d, "
1071 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1075 /* Insert DR into peeling hash table with NPEEL as key. */
1077 static void
1078 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1079 int npeel)
1081 struct _vect_peel_info elem, *slot;
1082 _vect_peel_info **new_slot;
1083 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1085 elem.npeel = npeel;
1086 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1087 if (slot)
1088 slot->count++;
1089 else
1091 slot = XNEW (struct _vect_peel_info);
1092 slot->npeel = npeel;
1093 slot->dr = dr;
1094 slot->count = 1;
1095 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1096 *new_slot = slot;
1099 if (!supportable_dr_alignment && unlimited_cost_model ())
1100 slot->count += VECT_MAX_COST;
1104 /* Traverse peeling hash table to find peeling option that aligns maximum
1105 number of data accesses. */
1107 int
1108 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1109 _vect_peel_extended_info *max)
1111 vect_peel_info elem = *slot;
1113 if (elem->count > max->peel_info.count
1114 || (elem->count == max->peel_info.count
1115 && max->peel_info.npeel > elem->npeel))
1117 max->peel_info.npeel = elem->npeel;
1118 max->peel_info.count = elem->count;
1119 max->peel_info.dr = elem->dr;
1122 return 1;
1126 /* Traverse peeling hash table and calculate cost for each peeling option.
1127 Find the one with the lowest cost. */
1129 int
1130 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1131 _vect_peel_extended_info *min)
1133 vect_peel_info elem = *slot;
1134 int save_misalignment, dummy;
1135 unsigned int inside_cost = 0, outside_cost = 0, i;
1136 gimple stmt = DR_STMT (elem->dr);
1137 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1138 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1139 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1140 struct data_reference *dr;
1141 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1142 int single_iter_cost;
1144 prologue_cost_vec.create (2);
1145 body_cost_vec.create (2);
1146 epilogue_cost_vec.create (2);
1148 FOR_EACH_VEC_ELT (datarefs, i, dr)
1150 stmt = DR_STMT (dr);
1151 stmt_info = vinfo_for_stmt (stmt);
1152 /* For interleaving, only the alignment of the first access
1153 matters. */
1154 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1155 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1156 continue;
1158 save_misalignment = DR_MISALIGNMENT (dr);
1159 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1160 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1161 &body_cost_vec);
1162 SET_DR_MISALIGNMENT (dr, save_misalignment);
1165 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1166 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1167 &dummy, single_iter_cost,
1168 &prologue_cost_vec,
1169 &epilogue_cost_vec);
1171 /* Prologue and epilogue costs are added to the target model later.
1172 These costs depend only on the scalar iteration cost, the
1173 number of peeling iterations finally chosen, and the number of
1174 misaligned statements. So discard the information found here. */
1175 prologue_cost_vec.release ();
1176 epilogue_cost_vec.release ();
1178 if (inside_cost < min->inside_cost
1179 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1181 min->inside_cost = inside_cost;
1182 min->outside_cost = outside_cost;
1183 min->body_cost_vec.release ();
1184 min->body_cost_vec = body_cost_vec;
1185 min->peel_info.dr = elem->dr;
1186 min->peel_info.npeel = elem->npeel;
1188 else
1189 body_cost_vec.release ();
1191 return 1;
1195 /* Choose best peeling option by traversing peeling hash table and either
1196 choosing an option with the lowest cost (if cost model is enabled) or the
1197 option that aligns as many accesses as possible. */
1199 static struct data_reference *
1200 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1201 unsigned int *npeel,
1202 stmt_vector_for_cost *body_cost_vec)
1204 struct _vect_peel_extended_info res;
1206 res.peel_info.dr = NULL;
1207 res.body_cost_vec = stmt_vector_for_cost ();
1209 if (!unlimited_cost_model ())
1211 res.inside_cost = INT_MAX;
1212 res.outside_cost = INT_MAX;
1213 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1214 .traverse <_vect_peel_extended_info *,
1215 vect_peeling_hash_get_lowest_cost> (&res);
1217 else
1219 res.peel_info.count = 0;
1220 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1221 .traverse <_vect_peel_extended_info *,
1222 vect_peeling_hash_get_most_frequent> (&res);
1225 *npeel = res.peel_info.npeel;
1226 *body_cost_vec = res.body_cost_vec;
1227 return res.peel_info.dr;
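/* Example (a minimal sketch): if peeling by 2 iterations would align three
   data-refs while peeling by 5 aligns only one, the unlimited cost model
   picks NPEEL = 2 as the most frequent entry; with the cost model enabled
   the candidates are instead compared on the inside/outside costs computed
   above.  */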
1231 /* Function vect_enhance_data_refs_alignment
1233 This pass will use loop versioning and loop peeling in order to enhance
1234 the alignment of data references in the loop.
1236 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1237 original loop is to be vectorized. Any other loops that are created by
1238 the transformations performed in this pass are not supposed to be
1239 vectorized. This restriction will be relaxed.
1241 This pass will require a cost model to guide it whether to apply peeling
1242 or versioning or a combination of the two. For example, the scheme that
1243 Intel uses when given a loop with several memory accesses is as follows:
1244 choose one memory access ('p') whose alignment you want to force by doing
1245 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1246 other accesses are not necessarily aligned, or (2) use loop versioning to
1247 generate one loop in which all accesses are aligned, and another loop in
1248 which only 'p' is necessarily aligned.
1250 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1251 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1252 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1254 Devising a cost model is the most critical aspect of this work. It will
1255 guide us on which access to peel for, whether to use loop versioning, how
1256 many versions to create, etc. The cost model will probably consist of
1257 generic considerations as well as target specific considerations (on
1258 powerpc for example, misaligned stores are more painful than misaligned
1259 loads).
1261 Here are the general steps involved in alignment enhancements:
1263 -- original loop, before alignment analysis:
1264 for (i=0; i<N; i++){
1265 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1266 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1269 -- After vect_compute_data_refs_alignment:
1270 for (i=0; i<N; i++){
1271 x = q[i]; # DR_MISALIGNMENT(q) = 3
1272 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1275 -- Possibility 1: we do loop versioning:
1276 if (p is aligned) {
1277 for (i=0; i<N; i++){ # loop 1A
1278 x = q[i]; # DR_MISALIGNMENT(q) = 3
1279 p[i] = y; # DR_MISALIGNMENT(p) = 0
1282 else {
1283 for (i=0; i<N; i++){ # loop 1B
1284 x = q[i]; # DR_MISALIGNMENT(q) = 3
1285 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1289 -- Possibility 2: we do loop peeling:
1290 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1291 x = q[i];
1292 p[i] = y;
1294 for (i = 3; i < N; i++){ # loop 2A
1295 x = q[i]; # DR_MISALIGNMENT(q) = 0
1296 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1299 -- Possibility 3: combination of loop peeling and versioning:
1300 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1301 x = q[i];
1302 p[i] = y;
1304 if (p is aligned) {
1305 for (i = 3; i<N; i++){ # loop 3A
1306 x = q[i]; # DR_MISALIGNMENT(q) = 0
1307 p[i] = y; # DR_MISALIGNMENT(p) = 0
1310 else {
1311 for (i = 3; i<N; i++){ # loop 3B
1312 x = q[i]; # DR_MISALIGNMENT(q) = 0
1313 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1317 These loops are later passed to loop_transform to be vectorized. The
1318 vectorizer will use the alignment information to guide the transformation
1319 (whether to generate regular loads/stores, or with special handling for
1320 misalignment). */
1322 bool
1323 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1325 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1326 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1327 enum dr_alignment_support supportable_dr_alignment;
1328 struct data_reference *dr0 = NULL, *first_store = NULL;
1329 struct data_reference *dr;
1330 unsigned int i, j;
1331 bool do_peeling = false;
1332 bool do_versioning = false;
1333 bool stat;
1334 gimple stmt;
1335 stmt_vec_info stmt_info;
1336 unsigned int npeel = 0;
1337 bool all_misalignments_unknown = true;
1338 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1339 unsigned possible_npeel_number = 1;
1340 tree vectype;
1341 unsigned int nelements, mis, same_align_drs_max = 0;
1342 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1344 if (dump_enabled_p ())
1345 dump_printf_loc (MSG_NOTE, vect_location,
1346 "=== vect_enhance_data_refs_alignment ===\n");
1348 /* While cost model enhancements are expected in the future, the high level
1349 view of the code at this time is as follows:
1351 A) If there is a misaligned access then see if peeling to align
1352 this access can make all data references satisfy
1353 vect_supportable_dr_alignment. If so, update data structures
1354 as needed and return true.
1356 B) If peeling wasn't possible and there is a data reference with an
1357 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1358 then see if loop versioning checks can be used to make all data
1359 references satisfy vect_supportable_dr_alignment. If so, update
1360 data structures as needed and return true.
1362 C) If neither peeling nor versioning were successful then return false if
1363 any data reference does not satisfy vect_supportable_dr_alignment.
1365 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1367 Note, Possibility 3 above (which is peeling and versioning together) is not
1368 being done at this time. */
1370 /* (1) Peeling to force alignment. */
1372 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1373 Considerations:
1374 + How many accesses will become aligned due to the peeling
1375 - How many accesses will become unaligned due to the peeling,
1376 and the cost of misaligned accesses.
1377 - The cost of peeling (the extra runtime checks, the increase
1378 in code size). */
1380 FOR_EACH_VEC_ELT (datarefs, i, dr)
1382 stmt = DR_STMT (dr);
1383 stmt_info = vinfo_for_stmt (stmt);
1385 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1386 continue;
1388 /* For interleaving, only the alignment of the first access
1389 matters. */
1390 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1391 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1392 continue;
1394 /* For invariant accesses there is nothing to enhance. */
1395 if (integer_zerop (DR_STEP (dr)))
1396 continue;
1398 /* Strided loads perform only component accesses, alignment is
1399 irrelevant for them. */
1400 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1401 continue;
1403 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1404 do_peeling = vector_alignment_reachable_p (dr);
1405 if (do_peeling)
1407 if (known_alignment_for_access_p (dr))
1409 unsigned int npeel_tmp;
1410 bool negative = tree_int_cst_compare (DR_STEP (dr),
1411 size_zero_node) < 0;
1413 /* Save info about DR in the hash table. */
1414 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1415 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1417 vectype = STMT_VINFO_VECTYPE (stmt_info);
1418 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1419 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1420 TREE_TYPE (DR_REF (dr))));
1421 npeel_tmp = (negative
1422 ? (mis - nelements) : (nelements - mis))
1423 & (nelements - 1);
1425 /* For multiple types, it is possible that the bigger type access
1426 will have more than one peeling option. E.g., a loop with two
1427 types: one of size (vector size / 4), and the other one of
1428 size (vector size / 8). The vectorization factor will be 8. If both
1429 accesses are misaligned by 3, the first one needs one scalar
1430 iteration to be aligned, and the second one needs 5. But the
1431 first one will also be aligned by peeling 5 scalar
1432 iterations, and in that case both accesses will be aligned.
1433 Hence, except for the immediate peeling amount, we also want
1434 to try adding a full vector size, as long as we don't exceed
1435 the vectorization factor.
1436 We do this automatically for the cost model, since we calculate
1437 the cost for every peeling option. */
1438 if (unlimited_cost_model ())
1439 possible_npeel_number = vf / nelements;
1441 /* Handle the aligned case. We may decide to align some other
1442 access, making DR unaligned. */
1443 if (DR_MISALIGNMENT (dr) == 0)
1445 npeel_tmp = 0;
1446 if (unlimited_cost_model ())
1447 possible_npeel_number++;
1450 for (j = 0; j < possible_npeel_number; j++)
1452 gcc_assert (npeel_tmp <= vf);
1453 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1454 npeel_tmp += nelements;
1457 all_misalignments_unknown = false;
1458 /* Data-ref that was chosen for the case that all the
1459 misalignments are unknown is not relevant anymore, since we
1460 have a data-ref with known alignment. */
1461 dr0 = NULL;
1463 else
1465 /* If we don't know any misalignment values, we prefer
1466 peeling for the data-ref that has the maximum number of data-refs
1467 with the same alignment, unless the target prefers to align
1468 stores over loads. */
1469 if (all_misalignments_unknown)
1471 unsigned same_align_drs
1472 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1473 if (!dr0
1474 || same_align_drs_max < same_align_drs)
1476 same_align_drs_max = same_align_drs;
1477 dr0 = dr;
1479 /* For data-refs with the same number of related
1480 accesses prefer the one where the misalign
1481 computation will be invariant in the outermost loop. */
1482 else if (same_align_drs_max == same_align_drs)
1484 struct loop *ivloop0, *ivloop;
1485 ivloop0 = outermost_invariant_loop_for_expr
1486 (loop, DR_BASE_ADDRESS (dr0));
1487 ivloop = outermost_invariant_loop_for_expr
1488 (loop, DR_BASE_ADDRESS (dr));
1489 if ((ivloop && !ivloop0)
1490 || (ivloop && ivloop0
1491 && flow_loop_nested_p (ivloop, ivloop0)))
1492 dr0 = dr;
1495 if (!first_store && DR_IS_WRITE (dr))
1496 first_store = dr;
1499 /* If there are both known and unknown misaligned accesses in the
1500 loop, we choose peeling amount according to the known
1501 accesses. */
1502 if (!supportable_dr_alignment)
1504 dr0 = dr;
1505 if (!first_store && DR_IS_WRITE (dr))
1506 first_store = dr;
1510 else
1512 if (!aligned_access_p (dr))
1514 if (dump_enabled_p ())
1515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1516 "vector alignment may not be reachable\n");
1517 break;
1522 /* Check if we can possibly peel the loop. */
1523 if (!vect_can_advance_ivs_p (loop_vinfo)
1524 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1525 do_peeling = false;
1527 if (do_peeling && all_misalignments_unknown
1528 && vect_supportable_dr_alignment (dr0, false))
1531 /* Check if the target prefers to align stores over loads, i.e., if
1532 misaligned stores are more expensive than misaligned loads (taking
1533 drs with same alignment into account). */
1534 if (first_store && DR_IS_READ (dr0))
1536 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1537 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1538 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1539 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1540 stmt_vector_for_cost dummy;
1541 dummy.create (2);
1543 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1544 &dummy);
1545 vect_get_data_access_cost (first_store, &store_inside_cost,
1546 &store_outside_cost, &dummy);
1548 dummy.release ();
1550 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1551 aligning the load DR0). */
1552 load_inside_penalty = store_inside_cost;
1553 load_outside_penalty = store_outside_cost;
1554 for (i = 0;
1555 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1556 DR_STMT (first_store))).iterate (i, &dr);
1557 i++)
1558 if (DR_IS_READ (dr))
1560 load_inside_penalty += load_inside_cost;
1561 load_outside_penalty += load_outside_cost;
1563 else
1565 load_inside_penalty += store_inside_cost;
1566 load_outside_penalty += store_outside_cost;
1569 /* Calculate the penalty for leaving DR0 unaligned (by
1570 aligning the FIRST_STORE). */
1571 store_inside_penalty = load_inside_cost;
1572 store_outside_penalty = load_outside_cost;
1573 for (i = 0;
1574 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1575 DR_STMT (dr0))).iterate (i, &dr);
1576 i++)
1577 if (DR_IS_READ (dr))
1579 store_inside_penalty += load_inside_cost;
1580 store_outside_penalty += load_outside_cost;
1582 else
1584 store_inside_penalty += store_inside_cost;
1585 store_outside_penalty += store_outside_cost;
1588 if (load_inside_penalty > store_inside_penalty
1589 || (load_inside_penalty == store_inside_penalty
1590 && load_outside_penalty > store_outside_penalty))
1591 dr0 = first_store;
1594 /* In case there are only loads with different unknown misalignments, use
1595 peeling only if it may help to align other accesses in the loop. */
1596 if (!first_store
1597 && !STMT_VINFO_SAME_ALIGN_REFS (
1598 vinfo_for_stmt (DR_STMT (dr0))).length ()
1599 && vect_supportable_dr_alignment (dr0, false)
1600 != dr_unaligned_supported)
1601 do_peeling = false;
1604 if (do_peeling && !dr0)
1606 /* Peeling is possible, but there is no data access that is unsupported
1607 when unaligned. So we try to choose the best possible peeling. */
1609 /* We should get here only if there are drs with known misalignment. */
1610 gcc_assert (!all_misalignments_unknown);
1612 /* Choose the best peeling from the hash table. */
1613 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1614 &body_cost_vec);
1615 if (!dr0 || !npeel)
1616 do_peeling = false;
1619 if (do_peeling)
1621 stmt = DR_STMT (dr0);
1622 stmt_info = vinfo_for_stmt (stmt);
1623 vectype = STMT_VINFO_VECTYPE (stmt_info);
1624 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1626 if (known_alignment_for_access_p (dr0))
1628 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1629 size_zero_node) < 0;
1630 if (!npeel)
1632 /* Since it's known at compile time, compute the number of
1633 iterations in the peeled loop (the peeling factor) for use in
1634 updating DR_MISALIGNMENT values. The peeling factor is the
1635 vectorization factor minus the misalignment as an element
1636 count. */
1637 mis = DR_MISALIGNMENT (dr0);
1638 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1639 npeel = ((negative ? mis - nelements : nelements - mis)
1640 & (nelements - 1));
1643 /* For interleaved data access every iteration accesses all the
1644 members of the group, therefore we divide the number of iterations
1645 by the group size. */
1646 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1647 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1648 npeel /= GROUP_SIZE (stmt_info);
1650 if (dump_enabled_p ())
1651 dump_printf_loc (MSG_NOTE, vect_location,
1652 "Try peeling by %d\n", npeel);
1655 /* Ensure that all data refs can be vectorized after the peel. */
1656 FOR_EACH_VEC_ELT (datarefs, i, dr)
1658 int save_misalignment;
1660 if (dr == dr0)
1661 continue;
1663 stmt = DR_STMT (dr);
1664 stmt_info = vinfo_for_stmt (stmt);
1665 /* For interleaving, only the alignment of the first access
1666 matters. */
1667 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1668 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1669 continue;
1671 /* Strided loads perform only component accesses, alignment is
1672 irrelevant for them. */
1673 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1674 continue;
1676 save_misalignment = DR_MISALIGNMENT (dr);
1677 vect_update_misalignment_for_peel (dr, dr0, npeel);
1678 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1679 SET_DR_MISALIGNMENT (dr, save_misalignment);
1681 if (!supportable_dr_alignment)
1683 do_peeling = false;
1684 break;
1688 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1690 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1691 if (!stat)
1692 do_peeling = false;
1693 else
1695 body_cost_vec.release ();
1696 return stat;
1700 if (do_peeling)
1702 unsigned max_allowed_peel
1703 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1704 if (max_allowed_peel != (unsigned)-1)
1706 unsigned max_peel = npeel;
1707 if (max_peel == 0)
1709 gimple dr_stmt = DR_STMT (dr0);
1710 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1711 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1712 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1714 if (max_peel > max_allowed_peel)
1716 do_peeling = false;
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_NOTE, vect_location,
1719 "Disable peeling, max peels reached: %d\n", max_peel);
1724 if (do_peeling)
1726 stmt_info_for_cost *si;
1727 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1729 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1730 If the misalignment of DR_i is identical to that of dr0 then set
1731 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1732 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1733 by the peeling factor times the element size of DR_i (MOD the
1734 vectorization factor times the size). Otherwise, the
1735 misalignment of DR_i must be set to unknown. */
1736 FOR_EACH_VEC_ELT (datarefs, i, dr)
1737 if (dr != dr0)
1738 vect_update_misalignment_for_peel (dr, dr0, npeel);
1740 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1741 if (npeel)
1742 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1743 else
1744 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1745 = DR_MISALIGNMENT (dr0);
1746 SET_DR_MISALIGNMENT (dr0, 0);
1747 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location,
1750 "Alignment of access forced using peeling.\n");
1751 dump_printf_loc (MSG_NOTE, vect_location,
1752 "Peeling for alignment will be applied.\n");
1754 /* We've delayed passing the inside-loop peeling costs to the
1755 target cost model until we were sure peeling would happen.
1756 Do so now. */
1757 if (body_cost_vec.exists ())
1759 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1761 struct _stmt_vec_info *stmt_info
1762 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1763 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1764 si->misalign, vect_body);
1766 body_cost_vec.release ();
1769 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1770 gcc_assert (stat);
1771 return stat;
1775 body_cost_vec.release ();
1777 /* (2) Versioning to force alignment. */
1779 /* Try versioning if:
1780 1) optimize loop for speed
1781 2) there is at least one unsupported misaligned data ref with an unknown
1782 misalignment, and
1783 3) all misaligned data refs with a known misalignment are supported, and
1784 4) the number of runtime alignment checks is within reason. */
1786 do_versioning =
1787 optimize_loop_nest_for_speed_p (loop)
1788 && (!loop->inner); /* FORNOW */
1790 if (do_versioning)
1792 FOR_EACH_VEC_ELT (datarefs, i, dr)
1794 stmt = DR_STMT (dr);
1795 stmt_info = vinfo_for_stmt (stmt);
1797 /* For interleaving, only the alignment of the first access
1798 matters. */
1799 if (aligned_access_p (dr)
1800 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1801 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1802 continue;
1804 /* Strided loads perform only component accesses, alignment is
1805 irrelevant for them. */
1806 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1807 continue;
1809 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1811 if (!supportable_dr_alignment)
1813 gimple stmt;
1814 int mask;
1815 tree vectype;
1817 if (known_alignment_for_access_p (dr)
1818 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1819 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1821 do_versioning = false;
1822 break;
1825 stmt = DR_STMT (dr);
1826 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1827 gcc_assert (vectype);
1829 /* The rightmost bits of an aligned address must be zeros.
1830 Construct the mask needed for this test. For example,
1831 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1832 mask must be 15 = 0xf. */
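/* A rough sketch (not the exact code emitted later by the versioning
   machinery) of the runtime test this mask feeds into, for the
   addresses ADDR_1 ... ADDR_N of the potentially misaligned refs:

     if ((((uintptr_t) addr_1 | ... | (uintptr_t) addr_n) & mask) == 0)
       ... vectorized, aligned version of the loop ...
     else
       ... original scalar loop ...  */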
1833 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1835 /* FORNOW: use the same mask to test all potentially unaligned
1836 references in the loop. The vectorizer currently supports
1837 a single vector size, see the reference to
1838 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1839 vectorization factor is computed. */
1840 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1841 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1842 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1843 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1844 DR_STMT (dr));
1848 /* Versioning requires at least one misaligned data reference. */
1849 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1850 do_versioning = false;
1851 else if (!do_versioning)
1852 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1855 if (do_versioning)
1857 vec<gimple> may_misalign_stmts
1858 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1859 gimple stmt;
1861 /* It can now be assumed that the data references in the statements
1862 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1863 of the loop being vectorized. */
1864 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1866 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1867 dr = STMT_VINFO_DATA_REF (stmt_info);
1868 SET_DR_MISALIGNMENT (dr, 0);
1869 if (dump_enabled_p ())
1870 dump_printf_loc (MSG_NOTE, vect_location,
1871 "Alignment of access forced using versioning.\n");
1874 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_NOTE, vect_location,
1876 "Versioning for alignment will be applied.\n");
1878 /* Peeling and versioning can't be done together at this time. */
1879 gcc_assert (! (do_peeling && do_versioning));
1881 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1882 gcc_assert (stat);
1883 return stat;
1886 /* This point is reached if neither peeling nor versioning is being done. */
1887 gcc_assert (! (do_peeling || do_versioning));
1889 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1890 return stat;
1894 /* Function vect_find_same_alignment_drs.
1896 Update group and alignment relations according to the chosen
1897 vectorization factor. */
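/* For example, with a vectorization factor of 4 and 4-byte elements,
   accesses a[i] and a[i + 4] have dependence distance 4; since
   4 % 4 == 0 and the element sizes match, they are recorded as having
   the same alignment, whereas a[i] and a[i + 2] (distance 2) are not.  */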
1899 static void
1900 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1901 loop_vec_info loop_vinfo)
1903 unsigned int i;
1904 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1905 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1906 struct data_reference *dra = DDR_A (ddr);
1907 struct data_reference *drb = DDR_B (ddr);
1908 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1909 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1910 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1911 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1912 lambda_vector dist_v;
1913 unsigned int loop_depth;
1915 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1916 return;
1918 if (dra == drb)
1919 return;
1921 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1922 return;
1924 /* Loop-based vectorization and known data dependence. */
1925 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1926 return;
1928 /* Data-dependence analysis reports a distance vector of zero
1929 for data-references that overlap only in the first iteration
 1930                  but have steps of different sign (see PR45764).
1931 So as a sanity check require equal DR_STEP. */
1932 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1933 return;
1935 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1936 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1938 int dist = dist_v[loop_depth];
1940 if (dump_enabled_p ())
1941 dump_printf_loc (MSG_NOTE, vect_location,
1942 "dependence distance = %d.\n", dist);
1944 /* Same loop iteration. */
1945 if (dist == 0
1946 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1948 /* Two references with distance zero have the same alignment. */
1949 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1950 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1951 if (dump_enabled_p ())
1953 dump_printf_loc (MSG_NOTE, vect_location,
1954 "accesses have the same alignment.\n");
1955 dump_printf (MSG_NOTE,
1956 "dependence distance modulo vf == 0 between ");
1957 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1958 dump_printf (MSG_NOTE, " and ");
1959 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1960 dump_printf (MSG_NOTE, "\n");
1967 /* Function vect_analyze_data_refs_alignment
1969 Analyze the alignment of the data-references in the loop.
1970 Return FALSE if a data reference is found that cannot be vectorized. */
1972 bool
1973 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1974 bb_vec_info bb_vinfo)
1976 if (dump_enabled_p ())
1977 dump_printf_loc (MSG_NOTE, vect_location,
1978 "=== vect_analyze_data_refs_alignment ===\n");
1980 /* Mark groups of data references with same alignment using
1981 data dependence information. */
1982 if (loop_vinfo)
1984 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1985 struct data_dependence_relation *ddr;
1986 unsigned int i;
1988 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1989 vect_find_same_alignment_drs (ddr, loop_vinfo);
1992 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1994 if (dump_enabled_p ())
1995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1996 "not vectorized: can't calculate alignment "
1997 "for data ref.\n");
1998 return false;
2001 return true;
2005 /* Analyze groups of accesses: check that DR belongs to a group of
2006 accesses of legal size, step, etc. Detect gaps, single element
2007 interleaving, and other special cases. Set grouped access info.
2008 Collect groups of strided stores for further use in SLP analysis. */
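/* For example, the pair of stores a[2*i] = x; a[2*i+1] = y; with
   4-byte elements has DR_STEP 8, so GROUPSIZE = 8 / 4 = 2 and there is
   no gap; a lone load from a[2*i] with the same step is detected below
   as single-element interleaving with a gap.  */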
2010 static bool
2011 vect_analyze_group_access (struct data_reference *dr)
2013 tree step = DR_STEP (dr);
2014 tree scalar_type = TREE_TYPE (DR_REF (dr));
2015 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2016 gimple stmt = DR_STMT (dr);
2017 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2018 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2019 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2020 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2021 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2022 bool slp_impossible = false;
2023 struct loop *loop = NULL;
2025 if (loop_vinfo)
2026 loop = LOOP_VINFO_LOOP (loop_vinfo);
2028 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2029 size of the interleaving group (including gaps). */
2030 groupsize = absu_hwi (dr_step) / type_size;
 2032   /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2033 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
 2035       /* Check if this DR is part of an interleaving group, and is a single
 2036          element of the group that is accessed in the loop.  */
2038 /* Gaps are supported only for loads. STEP must be a multiple of the type
2039 size. The size of the group must be a power of 2. */
2040 if (DR_IS_READ (dr)
2041 && (dr_step % type_size) == 0
2042 && groupsize > 0
2043 && exact_log2 (groupsize) != -1)
2045 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2046 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2047 if (dump_enabled_p ())
2049 dump_printf_loc (MSG_NOTE, vect_location,
2050 "Detected single element interleaving ");
2051 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2052 dump_printf (MSG_NOTE, " step ");
2053 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2054 dump_printf (MSG_NOTE, "\n");
2057 if (loop_vinfo)
2059 if (dump_enabled_p ())
2060 dump_printf_loc (MSG_NOTE, vect_location,
2061 "Data access with gaps requires scalar "
2062 "epilogue loop\n");
2063 if (loop->inner)
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2067 "Peeling for outer loop is not"
2068 " supported\n");
2069 return false;
2072 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2075 return true;
2078 if (dump_enabled_p ())
2080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2081 "not consecutive access ");
2082 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2083 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2086 if (bb_vinfo)
2088 /* Mark the statement as unvectorizable. */
2089 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2090 return true;
2093 return false;
2096 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2098 /* First stmt in the interleaving chain. Check the chain. */
2099 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2100 struct data_reference *data_ref = dr;
2101 unsigned int count = 1;
2102 tree prev_init = DR_INIT (data_ref);
2103 gimple prev = stmt;
2104 HOST_WIDE_INT diff, gaps = 0;
2105 unsigned HOST_WIDE_INT count_in_bytes;
2107 while (next)
 2109           /* Skip identical data-refs.  In case two or more stmts share a
2110 data-ref (supported only for loads), we vectorize only the first
2111 stmt, and the rest get their vectorized loads from the first
2112 one. */
2113 if (!tree_int_cst_compare (DR_INIT (data_ref),
2114 DR_INIT (STMT_VINFO_DATA_REF (
2115 vinfo_for_stmt (next)))))
2117 if (DR_IS_WRITE (data_ref))
2119 if (dump_enabled_p ())
2120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2121 "Two store stmts share the same dr.\n");
2122 return false;
2125 /* For load use the same data-ref load. */
2126 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2128 prev = next;
2129 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2130 continue;
2133 prev = next;
2134 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2136 /* All group members have the same STEP by construction. */
2137 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2139 /* Check that the distance between two accesses is equal to the type
2140 size. Otherwise, we have gaps. */
2141 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2142 - TREE_INT_CST_LOW (prev_init)) / type_size;
2143 if (diff != 1)
2145 /* FORNOW: SLP of accesses with gaps is not supported. */
2146 slp_impossible = true;
2147 if (DR_IS_WRITE (data_ref))
2149 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2151 "interleaved store with gaps\n");
2152 return false;
2155 gaps += diff - 1;
2158 last_accessed_element += diff;
2160 /* Store the gap from the previous member of the group. If there is no
2161 gap in the access, GROUP_GAP is always 1. */
2162 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2164 prev_init = DR_INIT (data_ref);
2165 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2166 /* Count the number of data-refs in the chain. */
2167 count++;
 2170       /* COUNT is the number of accesses found; we multiply it by the size of
2171 the type to get COUNT_IN_BYTES. */
2172 count_in_bytes = type_size * count;
2174 /* Check that the size of the interleaving (including gaps) is not
2175 greater than STEP. */
2176 if (dr_step != 0
2177 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2179 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182 "interleaving size is greater than step for ");
2183 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2184 DR_REF (dr));
2185 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2187 return false;
2190 /* Check that the size of the interleaving is equal to STEP for stores,
2191 i.e., that there are no gaps. */
2192 if (dr_step != 0
2193 && absu_hwi (dr_step) != count_in_bytes)
2195 if (DR_IS_READ (dr))
2197 slp_impossible = true;
 2198               /* There is a gap after the last load in the group.  This gap is the
2199 difference between the groupsize and the number of elements.
2200 When there is no gap, this difference should be 0. */
2201 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2203 else
2205 if (dump_enabled_p ())
2206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2207 "interleaved store with gaps\n");
2208 return false;
2212 /* Check that STEP is a multiple of type size. */
2213 if (dr_step != 0
2214 && (dr_step % type_size) != 0)
2216 if (dump_enabled_p ())
2218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2219 "step is not a multiple of type size: step ");
2220 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2221 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2222 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2223 TYPE_SIZE_UNIT (scalar_type));
2224 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2226 return false;
2229 if (groupsize == 0)
2230 groupsize = count;
2232 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_NOTE, vect_location,
2235 "Detected interleaving of size %d\n", (int)groupsize);
2237 /* SLP: create an SLP data structure for every interleaving group of
 2238          stores for further analysis in vect_analyze_slp.  */
2239 if (DR_IS_WRITE (dr) && !slp_impossible)
2241 if (loop_vinfo)
2242 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2243 if (bb_vinfo)
2244 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2247 /* There is a gap in the end of the group. */
2248 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2250 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252 "Data access with gaps requires scalar "
2253 "epilogue loop\n");
2254 if (loop->inner)
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2258 "Peeling for outer loop is not supported\n");
2259 return false;
2262 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2266 return true;
2270 /* Analyze the access pattern of the data-reference DR.
2271 In case of non-consecutive accesses call vect_analyze_group_access() to
2272 analyze groups of accesses. */
2274 static bool
2275 vect_analyze_data_ref_access (struct data_reference *dr)
2277 tree step = DR_STEP (dr);
2278 tree scalar_type = TREE_TYPE (DR_REF (dr));
2279 gimple stmt = DR_STMT (dr);
2280 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2281 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2282 struct loop *loop = NULL;
2284 if (loop_vinfo)
2285 loop = LOOP_VINFO_LOOP (loop_vinfo);
2287 if (loop_vinfo && !step)
2289 if (dump_enabled_p ())
2290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2291 "bad data-ref access in loop\n");
2292 return false;
 2295   /* Allow invariant loads in loops that are not nested.  */
2296 if (loop_vinfo && integer_zerop (step))
2298 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2299 if (nested_in_vect_loop_p (loop, stmt))
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_NOTE, vect_location,
2303 "zero step in inner loop of nest\n");
2304 return false;
2306 return DR_IS_READ (dr);
2309 if (loop && nested_in_vect_loop_p (loop, stmt))
2311 /* Interleaved accesses are not yet supported within outer-loop
2312 vectorization for references in the inner-loop. */
2313 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2315 /* For the rest of the analysis we use the outer-loop step. */
2316 step = STMT_VINFO_DR_STEP (stmt_info);
2317 if (integer_zerop (step))
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE, vect_location,
2321 "zero step in outer loop.\n");
2322 if (DR_IS_READ (dr))
2323 return true;
2324 else
2325 return false;
2329 /* Consecutive? */
2330 if (TREE_CODE (step) == INTEGER_CST)
2332 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2333 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2334 || (dr_step < 0
2335 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2337 /* Mark that it is not interleaving. */
2338 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2339 return true;
2343 if (loop && nested_in_vect_loop_p (loop, stmt))
2345 if (dump_enabled_p ())
2346 dump_printf_loc (MSG_NOTE, vect_location,
2347 "grouped access in outer loop.\n");
2348 return false;
 2351   /* Assume this is a DR handled by the non-constant strided load case.  */
2352 if (TREE_CODE (step) != INTEGER_CST)
2353 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
 2355   /* Not a consecutive access - check if it is part of an interleaving group.  */
2356 return vect_analyze_group_access (dr);
2361 /* A helper function used in the comparator function to sort data
2362 references. T1 and T2 are two data references to be compared.
2363 The function returns -1, 0, or 1. */
2365 static int
2366 compare_tree (tree t1, tree t2)
2368 int i, cmp;
2369 enum tree_code code;
2370 char tclass;
2372 if (t1 == t2)
2373 return 0;
2374 if (t1 == NULL)
2375 return -1;
2376 if (t2 == NULL)
2377 return 1;
2380 if (TREE_CODE (t1) != TREE_CODE (t2))
2381 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2383 code = TREE_CODE (t1);
2384 switch (code)
2386 /* For const values, we can just use hash values for comparisons. */
2387 case INTEGER_CST:
2388 case REAL_CST:
2389 case FIXED_CST:
2390 case STRING_CST:
2391 case COMPLEX_CST:
2392 case VECTOR_CST:
2394 hashval_t h1 = iterative_hash_expr (t1, 0);
2395 hashval_t h2 = iterative_hash_expr (t2, 0);
2396 if (h1 != h2)
2397 return h1 < h2 ? -1 : 1;
2398 break;
2401 case SSA_NAME:
2402 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2403 if (cmp != 0)
2404 return cmp;
2406 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2407 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2408 break;
2410 default:
2411 tclass = TREE_CODE_CLASS (code);
 2413       /* For declarations, we can compare their UIDs.  */
2414 if (tclass == tcc_declaration)
2416 if (DECL_UID (t1) != DECL_UID (t2))
2417 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2418 break;
2421 /* For expressions with operands, compare their operands recursively. */
2422 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2424 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2425 if (cmp != 0)
2426 return cmp;
2430 return 0;
 2434 /* Compare two data-references DRA and DRB so that they can be ordered
 2435    into chunks suitable for grouping.  */
2437 static int
2438 dr_group_sort_cmp (const void *dra_, const void *drb_)
2440 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2441 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2442 int cmp;
2444 /* Stabilize sort. */
2445 if (dra == drb)
2446 return 0;
2448 /* Ordering of DRs according to base. */
2449 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2451 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2452 if (cmp != 0)
2453 return cmp;
2456 /* And according to DR_OFFSET. */
2457 if (!dr_equal_offsets_p (dra, drb))
2459 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2460 if (cmp != 0)
2461 return cmp;
2464 /* Put reads before writes. */
2465 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2466 return DR_IS_READ (dra) ? -1 : 1;
2468 /* Then sort after access size. */
2469 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2470 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2472 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2473 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2474 if (cmp != 0)
2475 return cmp;
2478 /* And after step. */
2479 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2481 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2482 if (cmp != 0)
2483 return cmp;
2486 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2487 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2488 if (cmp == 0)
2489 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2490 return cmp;
2493 /* Function vect_analyze_data_ref_accesses.
2495 Analyze the access pattern of all the data references in the loop.
2497 FORNOW: the only access pattern that is considered vectorizable is a
2498 simple step 1 (consecutive) access.
2500 FORNOW: handle only arrays and pointer accesses. */
2502 bool
2503 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2505 unsigned int i;
2506 vec<data_reference_p> datarefs;
2507 struct data_reference *dr;
2509 if (dump_enabled_p ())
2510 dump_printf_loc (MSG_NOTE, vect_location,
2511 "=== vect_analyze_data_ref_accesses ===\n");
2513 if (loop_vinfo)
2514 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2515 else
2516 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2518 if (datarefs.is_empty ())
2519 return true;
2521 /* Sort the array of datarefs to make building the interleaving chains
2522 linear. */
2523 qsort (datarefs.address (), datarefs.length (),
2524 sizeof (data_reference_p), dr_group_sort_cmp);
2526 /* Build the interleaving chains. */
2527 for (i = 0; i < datarefs.length () - 1;)
2529 data_reference_p dra = datarefs[i];
2530 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2531 stmt_vec_info lastinfo = NULL;
2532 for (i = i + 1; i < datarefs.length (); ++i)
2534 data_reference_p drb = datarefs[i];
2535 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 2537           /* ??? Imperfect sorting (non-compatible types, non-modulo
 2538              accesses, same accesses) can cause a group to be artificially
 2539              split here as we don't just skip over those.  If it really
 2540              matters we can push those to a worklist and re-iterate
 2541              over them.  Then we can just skip ahead to the next DR here.  */
2543 /* Check that the data-refs have same first location (except init)
2544 and they are both either store or load (not load and store). */
2545 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2546 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2547 DR_BASE_ADDRESS (drb), 0)
2548 || !dr_equal_offsets_p (dra, drb))
2549 break;
2551 /* Check that the data-refs have the same constant size and step. */
2552 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2553 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2554 if (!tree_fits_uhwi_p (sza)
2555 || !tree_fits_uhwi_p (szb)
2556 || !tree_int_cst_equal (sza, szb)
2557 || !tree_fits_shwi_p (DR_STEP (dra))
2558 || !tree_fits_shwi_p (DR_STEP (drb))
2559 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2560 break;
2562 /* Do not place the same access in the interleaving chain twice. */
2563 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2564 break;
2566 /* Check the types are compatible.
2567 ??? We don't distinguish this during sorting. */
2568 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2569 TREE_TYPE (DR_REF (drb))))
2570 break;
2572 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2573 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2574 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2575 gcc_assert (init_a < init_b);
2577 /* If init_b == init_a + the size of the type * k, we have an
2578 interleaving, and DRA is accessed before DRB. */
2579 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2580 if ((init_b - init_a) % type_size_a != 0)
2581 break;
 2583           /* The step (if not zero) must be greater than the difference between
 2584              the data-refs' inits; this splits groups into suitable sizes.  */
2585 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2586 if (step != 0 && step <= (init_b - init_a))
2587 break;
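          /* For example, with 4-byte elements, DRA with init 0 and DRB with
             init 4 sharing step 16 pass both checks ((4 - 0) % 4 == 0 and
             16 > 4), so DRB is linked into DRA's chain below; with step 4
             the second check fails and a new group starts at DRB instead.  */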
2589 if (dump_enabled_p ())
2591 dump_printf_loc (MSG_NOTE, vect_location,
2592 "Detected interleaving ");
2593 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2594 dump_printf (MSG_NOTE, " and ");
2595 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2596 dump_printf (MSG_NOTE, "\n");
2599 /* Link the found element into the group list. */
2600 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2602 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2603 lastinfo = stmtinfo_a;
2605 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2606 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2607 lastinfo = stmtinfo_b;
2611 FOR_EACH_VEC_ELT (datarefs, i, dr)
2612 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2613 && !vect_analyze_data_ref_access (dr))
2615 if (dump_enabled_p ())
2616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2617 "not vectorized: complicated access pattern.\n");
2619 if (bb_vinfo)
2621 /* Mark the statement as not vectorizable. */
2622 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2623 continue;
2625 else
2626 return false;
2629 return true;
2633 /* Operator == between two dr_with_seg_len objects.
 2635    This equality operator is used to make sure two data refs
 2636    are the same, so that we will consider combining the
 2637    aliasing checks of those two pairs of data-dependent data
 2638    refs.  */
2640 static bool
2641 operator == (const dr_with_seg_len& d1,
2642 const dr_with_seg_len& d2)
2644 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2645 DR_BASE_ADDRESS (d2.dr), 0)
2646 && compare_tree (d1.offset, d2.offset) == 0
2647 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2650 /* Function comp_dr_with_seg_len_pair.
2652 Comparison function for sorting objects of dr_with_seg_len_pair_t
2653 so that we can combine aliasing checks in one scan. */
2655 static int
2656 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2658 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2659 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2661 const dr_with_seg_len &p11 = p1->first,
2662 &p12 = p1->second,
2663 &p21 = p2->first,
2664 &p22 = p2->second;
 2666   /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
 2667      if a and c have the same base address and step, and b and d have the same
 2668      base address and step.  Therefore, if either a&c or b&d do not have the
 2669      same address and step, we do not care about the order of those two pairs
         after sorting.  */
2670 int comp_res;
2672 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2673 DR_BASE_ADDRESS (p21.dr))) != 0)
2674 return comp_res;
2675 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2676 DR_BASE_ADDRESS (p22.dr))) != 0)
2677 return comp_res;
2678 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2679 return comp_res;
2680 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2681 return comp_res;
2682 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2683 return comp_res;
2684 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2685 return comp_res;
2687 return 0;
2690 template <class T> static void
2691 swap (T& a, T& b)
2693 T c (a);
2694 a = b;
2695 b = c;
2698 /* Function vect_vfa_segment_size.
 2700    Create an expression that computes the size of the segment
 2701    that will be accessed for a data reference.  The function takes into
 2702    account that realignment loads may access one more vector.
2704 Input:
2705 DR: The data reference.
2706 LENGTH_FACTOR: segment length to consider.
2708 Return an expression whose value is the size of segment which will be
2709 accessed by DR. */
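/* For example, a consecutive access to 4-byte elements (DR_STEP 4)
   with LENGTH_FACTOR equal to a vectorization factor of 4 gets a
   segment length of 4 * 4 = 16 bytes; if the optimized explicit
   realignment scheme is used, one more vector (16 bytes for V4SI)
   is added, giving 32 bytes.  */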
2711 static tree
2712 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2714 tree segment_length;
2716 if (integer_zerop (DR_STEP (dr)))
2717 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2718 else
2719 segment_length = size_binop (MULT_EXPR,
2720 fold_convert (sizetype, DR_STEP (dr)),
2721 fold_convert (sizetype, length_factor));
2723 if (vect_supportable_dr_alignment (dr, false)
2724 == dr_explicit_realign_optimized)
2726 tree vector_size = TYPE_SIZE_UNIT
2727 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2729 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2731 return segment_length;
2734 /* Function vect_prune_runtime_alias_test_list.
2736 Prune a list of ddrs to be tested at run-time by versioning for alias.
2737 Merge several alias checks into one if possible.
 2738    Return FALSE if the resulting list of ddrs is longer than allowed by
2739 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2741 bool
2742 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2744 vec<ddr_p> may_alias_ddrs =
2745 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2746 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2747 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2748 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2749 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2751 ddr_p ddr;
2752 unsigned int i;
2753 tree length_factor;
2755 if (dump_enabled_p ())
2756 dump_printf_loc (MSG_NOTE, vect_location,
2757 "=== vect_prune_runtime_alias_test_list ===\n");
2759 if (may_alias_ddrs.is_empty ())
2760 return true;
2762 /* Basically, for each pair of dependent data refs store_ptr_0
2763 and load_ptr_0, we create an expression:
2765 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2766 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2768 for aliasing checks. However, in some cases we can decrease
2769 the number of checks by combining two checks into one. For
2770 example, suppose we have another pair of data refs store_ptr_0
2771 and load_ptr_1, and if the following condition is satisfied:
2773 load_ptr_0 < load_ptr_1 &&
2774 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
 2776      (this condition means that, in each iteration of the vectorized loop,
 2777       the memory accessed by store_ptr_0 cannot lie between the memory
 2778       of load_ptr_0 and load_ptr_1.)
 2780      we can then use only the following expression to finish the
 2781      aliasing checks between store_ptr_0 & load_ptr_0 and
 2782      store_ptr_0 & load_ptr_1:
2784 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2785 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
 2787      Note that we only consider the case where load_ptr_0 and load_ptr_1
 2788      have the same base address.  */
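  /* Worked example (byte offsets, all segment lengths 16): if
     load_ptr_1 == load_ptr_0 + 16, then
     load_ptr_1 - load_ptr_0 - load_segment_length_0 == 0 < 16, so the
     two checks against store_ptr_0 collapse into the single check

       ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
       || (load_ptr_1 + load_segment_length_1 <= store_ptr_0).  */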
2790 comp_alias_ddrs.create (may_alias_ddrs.length ());
2792 /* First, we collect all data ref pairs for aliasing checks. */
2793 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2795 struct data_reference *dr_a, *dr_b;
2796 gimple dr_group_first_a, dr_group_first_b;
2797 tree segment_length_a, segment_length_b;
2798 gimple stmt_a, stmt_b;
2800 dr_a = DDR_A (ddr);
2801 stmt_a = DR_STMT (DDR_A (ddr));
2802 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2803 if (dr_group_first_a)
2805 stmt_a = dr_group_first_a;
2806 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2809 dr_b = DDR_B (ddr);
2810 stmt_b = DR_STMT (DDR_B (ddr));
2811 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2812 if (dr_group_first_b)
2814 stmt_b = dr_group_first_b;
2815 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2818 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2819 length_factor = scalar_loop_iters;
2820 else
2821 length_factor = size_int (vect_factor);
2822 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2823 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2825 dr_with_seg_len_pair_t dr_with_seg_len_pair
2826 (dr_with_seg_len (dr_a, segment_length_a),
2827 dr_with_seg_len (dr_b, segment_length_b));
2829 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2830 swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2832 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2835 /* Second, we sort the collected data ref pairs so that we can scan
2836 them once to combine all possible aliasing checks. */
2837 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2839 /* Third, we scan the sorted dr pairs and check if we can combine
2840 alias checks of two neighbouring dr pairs. */
2841 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2843 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2844 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2845 *dr_b1 = &comp_alias_ddrs[i-1].second,
2846 *dr_a2 = &comp_alias_ddrs[i].first,
2847 *dr_b2 = &comp_alias_ddrs[i].second;
2849 /* Remove duplicate data ref pairs. */
2850 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2852 if (dump_enabled_p ())
2854 dump_printf_loc (MSG_NOTE, vect_location,
2855 "found equal ranges ");
2856 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2857 DR_REF (dr_a1->dr));
2858 dump_printf (MSG_NOTE, ", ");
2859 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2860 DR_REF (dr_b1->dr));
2861 dump_printf (MSG_NOTE, " and ");
2862 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2863 DR_REF (dr_a2->dr));
2864 dump_printf (MSG_NOTE, ", ");
2865 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2866 DR_REF (dr_b2->dr));
2867 dump_printf (MSG_NOTE, "\n");
2870 comp_alias_ddrs.ordered_remove (i--);
2871 continue;
2874 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
 2876           /* We consider the case where DR_B1 and DR_B2 are the same memref,
 2877              and DR_A1 and DR_A2 are two consecutive memrefs.  */
2878 if (*dr_a1 == *dr_a2)
2880 swap (dr_a1, dr_b1);
2881 swap (dr_a2, dr_b2);
2884 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2885 DR_BASE_ADDRESS (dr_a2->dr),
2887 || !tree_fits_shwi_p (dr_a1->offset)
2888 || !tree_fits_shwi_p (dr_a2->offset))
2889 continue;
2891 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2892 - tree_to_shwi (dr_a1->offset));
2895 /* Now we check if the following condition is satisfied:
2897 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2899 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
 2900              SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant, so we
 2901              have to make a conservative estimate.  We take the minimum value
 2902              of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B;
 2903              then either of the following two conditions guarantees the
 2904              one above:
2906 1: DIFF <= MIN_SEG_LEN_B
2907 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
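          /* For example, if DR_A1 is at offset 0 with segment length 16,
             DR_A2 is at offset 8 and MIN_SEG_LEN_B is 16, then DIFF = 8 <= 16,
             so the pairs are merged below and DR_A1's segment length becomes
             DR_A2's segment length plus 8.  */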
2911 HOST_WIDE_INT
2912 min_seg_len_b = (TREE_CODE (dr_b1->seg_len) == INTEGER_CST) ?
2913 TREE_INT_CST_LOW (dr_b1->seg_len) :
2914 vect_factor;
2916 if (diff <= min_seg_len_b
2917 || (TREE_CODE (dr_a1->seg_len) == INTEGER_CST
2918 && diff - (HOST_WIDE_INT) TREE_INT_CST_LOW (dr_a1->seg_len) <
2919 min_seg_len_b))
2921 dr_a1->seg_len = size_binop (PLUS_EXPR,
2922 dr_a2->seg_len, size_int (diff));
2923 comp_alias_ddrs.ordered_remove (i--);
2928 if ((int) comp_alias_ddrs.length () >
2929 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2931 if (dump_enabled_p ())
2933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2934 "disable versioning for alias - max number of "
2935 "generated checks exceeded.\n");
2938 return false;
2941 return true;
2944 /* Check whether a non-affine read in stmt is suitable for gather load
2945 and if so, return a builtin decl for that operation. */
2947 tree
2948 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2949 tree *offp, int *scalep)
2951 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2952 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2953 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2954 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2955 tree offtype = NULL_TREE;
2956 tree decl, base, off;
2957 enum machine_mode pmode;
2958 int punsignedp, pvolatilep;
2960 /* The gather builtins need address of the form
 2961        loop_invariant + vector * {1, 2, 4, 8} or
2963 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2964 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2965 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2966 multiplications and additions in it. To get a vector, we need
2967 a single SSA_NAME that will be defined in the loop and will
2968 contain everything that is not loop invariant and that can be
 2969      vectorized.  The following code attempts to find such a preexisting
2970 SSA_NAME OFF and put the loop invariants into a tree BASE
2971 that can be gimplified before the loop. */
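  /* For example, for a read a[b[i]] of 4-byte elements, the aim is to
     end up with BASE == &a (loop invariant), OFF == the SSA_NAME
     holding b[i], and a scale of 4, matching the builtin form above
     (a and b here are just illustrative arrays).  */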
2972 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2973 &pmode, &punsignedp, &pvolatilep);
2974 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2976 if (TREE_CODE (base) == MEM_REF)
2978 if (!integer_zerop (TREE_OPERAND (base, 1)))
2980 if (off == NULL_TREE)
2982 double_int moff = mem_ref_offset (base);
2983 off = double_int_to_tree (sizetype, moff);
2985 else
2986 off = size_binop (PLUS_EXPR, off,
2987 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2989 base = TREE_OPERAND (base, 0);
2991 else
2992 base = build_fold_addr_expr (base);
2994 if (off == NULL_TREE)
2995 off = size_zero_node;
 2997   /* If base is not loop invariant, then either off is 0 (and we start with
 2998      just the constant offset in the loop-invariant BASE and continue with
 2999      base as OFF), or we give up.
3000 We could handle that case by gimplifying the addition of base + off
3001 into some SSA_NAME and use that as off, but for now punt. */
3002 if (!expr_invariant_in_loop_p (loop, base))
3004 if (!integer_zerop (off))
3005 return NULL_TREE;
3006 off = base;
3007 base = size_int (pbitpos / BITS_PER_UNIT);
3009 /* Otherwise put base + constant offset into the loop invariant BASE
3010 and continue with OFF. */
3011 else
3013 base = fold_convert (sizetype, base);
3014 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3017 /* OFF at this point may be either a SSA_NAME or some tree expression
3018 from get_inner_reference. Try to peel off loop invariants from it
3019 into BASE as long as possible. */
3020 STRIP_NOPS (off);
3021 while (offtype == NULL_TREE)
3023 enum tree_code code;
3024 tree op0, op1, add = NULL_TREE;
3026 if (TREE_CODE (off) == SSA_NAME)
3028 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3030 if (expr_invariant_in_loop_p (loop, off))
3031 return NULL_TREE;
3033 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3034 break;
3036 op0 = gimple_assign_rhs1 (def_stmt);
3037 code = gimple_assign_rhs_code (def_stmt);
3038 op1 = gimple_assign_rhs2 (def_stmt);
3040 else
3042 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3043 return NULL_TREE;
3044 code = TREE_CODE (off);
3045 extract_ops_from_tree (off, &code, &op0, &op1);
3047 switch (code)
3049 case POINTER_PLUS_EXPR:
3050 case PLUS_EXPR:
3051 if (expr_invariant_in_loop_p (loop, op0))
3053 add = op0;
3054 off = op1;
3055 do_add:
3056 add = fold_convert (sizetype, add);
3057 if (scale != 1)
3058 add = size_binop (MULT_EXPR, add, size_int (scale));
3059 base = size_binop (PLUS_EXPR, base, add);
3060 continue;
3062 if (expr_invariant_in_loop_p (loop, op1))
3064 add = op1;
3065 off = op0;
3066 goto do_add;
3068 break;
3069 case MINUS_EXPR:
3070 if (expr_invariant_in_loop_p (loop, op1))
3072 add = fold_convert (sizetype, op1);
3073 add = size_binop (MINUS_EXPR, size_zero_node, add);
3074 off = op0;
3075 goto do_add;
3077 break;
3078 case MULT_EXPR:
3079 if (scale == 1 && tree_fits_shwi_p (op1))
3081 scale = tree_to_shwi (op1);
3082 off = op0;
3083 continue;
3085 break;
3086 case SSA_NAME:
3087 off = op0;
3088 continue;
3089 CASE_CONVERT:
3090 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3091 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3092 break;
3093 if (TYPE_PRECISION (TREE_TYPE (op0))
3094 == TYPE_PRECISION (TREE_TYPE (off)))
3096 off = op0;
3097 continue;
3099 if (TYPE_PRECISION (TREE_TYPE (op0))
3100 < TYPE_PRECISION (TREE_TYPE (off)))
3102 off = op0;
3103 offtype = TREE_TYPE (off);
3104 STRIP_NOPS (off);
3105 continue;
3107 break;
3108 default:
3109 break;
3111 break;
 3114   /* If at the end OFF still isn't an SSA_NAME or isn't
3115 defined in the loop, punt. */
3116 if (TREE_CODE (off) != SSA_NAME
3117 || expr_invariant_in_loop_p (loop, off))
3118 return NULL_TREE;
3120 if (offtype == NULL_TREE)
3121 offtype = TREE_TYPE (off);
3123 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3124 offtype, scale);
3125 if (decl == NULL_TREE)
3126 return NULL_TREE;
3128 if (basep)
3129 *basep = base;
3130 if (offp)
3131 *offp = off;
3132 if (scalep)
3133 *scalep = scale;
3134 return decl;
3137 /* Function vect_analyze_data_refs.
3139 Find all the data references in the loop or basic block.
3141 The general structure of the analysis of data refs in the vectorizer is as
3142 follows:
3143 1- vect_analyze_data_refs(loop/bb): call
3144 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3145 in the loop/bb and their dependences.
3146 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3147 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3148 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3152 bool
3153 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3154 bb_vec_info bb_vinfo,
3155 int *min_vf)
3157 struct loop *loop = NULL;
3158 basic_block bb = NULL;
3159 unsigned int i;
3160 vec<data_reference_p> datarefs;
3161 struct data_reference *dr;
3162 tree scalar_type;
3164 if (dump_enabled_p ())
3165 dump_printf_loc (MSG_NOTE, vect_location,
3166 "=== vect_analyze_data_refs ===\n");
3168 if (loop_vinfo)
3170 loop = LOOP_VINFO_LOOP (loop_vinfo);
3171 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
3172 || find_data_references_in_loop
3173 (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
3175 if (dump_enabled_p ())
3176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3177 "not vectorized: loop contains function calls"
3178 " or data references that cannot be analyzed\n");
3179 return false;
3182 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3184 else
3186 gimple_stmt_iterator gsi;
3188 bb = BB_VINFO_BB (bb_vinfo);
3189 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3191 gimple stmt = gsi_stmt (gsi);
3192 if (!find_data_references_in_stmt (NULL, stmt,
3193 &BB_VINFO_DATAREFS (bb_vinfo)))
3195 /* Mark the rest of the basic-block as unvectorizable. */
3196 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3198 stmt = gsi_stmt (gsi);
3199 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3201 break;
3205 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3208 /* Go through the data-refs, check that the analysis succeeded. Update
3209 pointer from stmt_vec_info struct to DR and vectype. */
3211 FOR_EACH_VEC_ELT (datarefs, i, dr)
3213 gimple stmt;
3214 stmt_vec_info stmt_info;
3215 tree base, offset, init;
3216 bool gather = false;
3217 bool simd_lane_access = false;
3218 int vf;
3220 again:
3221 if (!dr || !DR_REF (dr))
3223 if (dump_enabled_p ())
3224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3225 "not vectorized: unhandled data-ref\n");
3226 return false;
3229 stmt = DR_STMT (dr);
3230 stmt_info = vinfo_for_stmt (stmt);
3232 /* Discard clobbers from the dataref vector. We will remove
3233 clobber stmts during vectorization. */
3234 if (gimple_clobber_p (stmt))
3236 if (i == datarefs.length () - 1)
3238 datarefs.pop ();
3239 break;
3241 datarefs[i] = datarefs.pop ();
3242 goto again;
3245 /* Check that analysis of the data-ref succeeded. */
3246 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3247 || !DR_STEP (dr))
3249 bool maybe_gather
3250 = DR_IS_READ (dr)
3251 && !TREE_THIS_VOLATILE (DR_REF (dr))
3252 && targetm.vectorize.builtin_gather != NULL;
3253 bool maybe_simd_lane_access
3254 = loop_vinfo && loop->simduid;
 3256           /* If the target supports vector gather loads, or if this might be
 3257              a SIMD lane access, see if they can be used.  */
3258 if (loop_vinfo
3259 && (maybe_gather || maybe_simd_lane_access)
3260 && !nested_in_vect_loop_p (loop, stmt))
3262 struct data_reference *newdr
3263 = create_data_ref (NULL, loop_containing_stmt (stmt),
3264 DR_REF (dr), stmt, true);
3265 gcc_assert (newdr != NULL && DR_REF (newdr));
3266 if (DR_BASE_ADDRESS (newdr)
3267 && DR_OFFSET (newdr)
3268 && DR_INIT (newdr)
3269 && DR_STEP (newdr)
3270 && integer_zerop (DR_STEP (newdr)))
3272 if (maybe_simd_lane_access)
3274 tree off = DR_OFFSET (newdr);
3275 STRIP_NOPS (off);
3276 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3277 && TREE_CODE (off) == MULT_EXPR
3278 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3280 tree step = TREE_OPERAND (off, 1);
3281 off = TREE_OPERAND (off, 0);
3282 STRIP_NOPS (off);
3283 if (CONVERT_EXPR_P (off)
3284 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3285 0)))
3286 < TYPE_PRECISION (TREE_TYPE (off)))
3287 off = TREE_OPERAND (off, 0);
3288 if (TREE_CODE (off) == SSA_NAME)
3290 gimple def = SSA_NAME_DEF_STMT (off);
3291 tree reft = TREE_TYPE (DR_REF (newdr));
3292 if (gimple_call_internal_p (def)
3293 && gimple_call_internal_fn (def)
3294 == IFN_GOMP_SIMD_LANE)
3296 tree arg = gimple_call_arg (def, 0);
3297 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3298 arg = SSA_NAME_VAR (arg);
3299 if (arg == loop->simduid
3300 /* For now. */
3301 && tree_int_cst_equal
3302 (TYPE_SIZE_UNIT (reft),
3303 step))
3305 DR_OFFSET (newdr) = ssize_int (0);
3306 DR_STEP (newdr) = step;
3307 DR_ALIGNED_TO (newdr)
3308 = size_int (BIGGEST_ALIGNMENT);
3309 dr = newdr;
3310 simd_lane_access = true;
3316 if (!simd_lane_access && maybe_gather)
3318 dr = newdr;
3319 gather = true;
3322 if (!gather && !simd_lane_access)
3323 free_data_ref (newdr);
3326 if (!gather && !simd_lane_access)
3328 if (dump_enabled_p ())
3330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3331 "not vectorized: data ref analysis "
3332 "failed ");
3333 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3334 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3337 if (bb_vinfo)
3338 break;
3340 return false;
3344 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3346 if (dump_enabled_p ())
3347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3348 "not vectorized: base addr of dr is a "
3349 "constant\n");
3351 if (bb_vinfo)
3352 break;
3354 if (gather || simd_lane_access)
3355 free_data_ref (dr);
3356 return false;
3359 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3361 if (dump_enabled_p ())
3363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3364 "not vectorized: volatile type ");
3365 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3366 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3369 if (bb_vinfo)
3370 break;
3372 return false;
3375 if (stmt_can_throw_internal (stmt))
3377 if (dump_enabled_p ())
3379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3380 "not vectorized: statement can throw an "
3381 "exception ");
3382 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3383 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3386 if (bb_vinfo)
3387 break;
3389 if (gather || simd_lane_access)
3390 free_data_ref (dr);
3391 return false;
3394 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3395 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3397 if (dump_enabled_p ())
3399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3400 "not vectorized: statement is bitfield "
3401 "access ");
3402 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3403 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3406 if (bb_vinfo)
3407 break;
3409 if (gather || simd_lane_access)
3410 free_data_ref (dr);
3411 return false;
3414 base = unshare_expr (DR_BASE_ADDRESS (dr));
3415 offset = unshare_expr (DR_OFFSET (dr));
3416 init = unshare_expr (DR_INIT (dr));
3418 if (is_gimple_call (stmt))
3420 if (dump_enabled_p ())
3422 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3423 "not vectorized: dr in a call ");
3424 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3425 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3428 if (bb_vinfo)
3429 break;
3431 if (gather || simd_lane_access)
3432 free_data_ref (dr);
3433 return false;
3436 /* Update DR field in stmt_vec_info struct. */
 3438       /* If the dataref is in an inner-loop of the loop that is considered
3439 for vectorization, we also want to analyze the access relative to
3440 the outer-loop (DR contains information only relative to the
3441 inner-most enclosing loop). We do that by building a reference to the
3442 first location accessed by the inner-loop, and analyze it relative to
3443 the outer-loop. */
3444 if (loop && nested_in_vect_loop_p (loop, stmt))
3446 tree outer_step, outer_base, outer_init;
3447 HOST_WIDE_INT pbitsize, pbitpos;
3448 tree poffset;
3449 enum machine_mode pmode;
3450 int punsignedp, pvolatilep;
3451 affine_iv base_iv, offset_iv;
3452 tree dinit;
3454 /* Build a reference to the first location accessed by the
3455 inner-loop: *(BASE+INIT). (The first location is actually
3456 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3457 tree inner_base = build_fold_indirect_ref
3458 (fold_build_pointer_plus (base, init));
3460 if (dump_enabled_p ())
3462 dump_printf_loc (MSG_NOTE, vect_location,
3463 "analyze in outer-loop: ");
3464 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3465 dump_printf (MSG_NOTE, "\n");
3468 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3469 &poffset, &pmode, &punsignedp, &pvolatilep);
3470 gcc_assert (outer_base != NULL_TREE);
3472 if (pbitpos % BITS_PER_UNIT != 0)
3474 if (dump_enabled_p ())
3475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3476 "failed: bit offset alignment.\n");
3477 return false;
3480 outer_base = build_fold_addr_expr (outer_base);
3481 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3482 &base_iv, false))
3484 if (dump_enabled_p ())
3485 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3486 "failed: evolution of base is not affine.\n");
3487 return false;
3490 if (offset)
3492 if (poffset)
3493 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3494 poffset);
3495 else
3496 poffset = offset;
3499 if (!poffset)
3501 offset_iv.base = ssize_int (0);
3502 offset_iv.step = ssize_int (0);
3504 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3505 &offset_iv, false))
3507 if (dump_enabled_p ())
3508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3509 "evolution of offset is not affine.\n");
3510 return false;
3513 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3514 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3515 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3516 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3517 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3519 outer_step = size_binop (PLUS_EXPR,
3520 fold_convert (ssizetype, base_iv.step),
3521 fold_convert (ssizetype, offset_iv.step));
3523 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3524 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3525 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3526 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3527 STMT_VINFO_DR_OFFSET (stmt_info) =
3528 fold_convert (ssizetype, offset_iv.base);
3529 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3530 size_int (highest_pow2_factor (offset_iv.base));
3532 if (dump_enabled_p ())
3534 dump_printf_loc (MSG_NOTE, vect_location,
3535 "\touter base_address: ");
3536 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3537 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3538 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3539 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3540 STMT_VINFO_DR_OFFSET (stmt_info));
3541 dump_printf (MSG_NOTE,
3542 "\n\touter constant offset from base address: ");
3543 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3544 STMT_VINFO_DR_INIT (stmt_info));
3545 dump_printf (MSG_NOTE, "\n\touter step: ");
3546 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3547 STMT_VINFO_DR_STEP (stmt_info));
3548 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3549 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3550 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3551 dump_printf (MSG_NOTE, "\n");
3555 if (STMT_VINFO_DATA_REF (stmt_info))
3557 if (dump_enabled_p ())
3559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3560 "not vectorized: more than one data ref "
3561 "in stmt: ");
3562 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3563 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3566 if (bb_vinfo)
3567 break;
3569 if (gather || simd_lane_access)
3570 free_data_ref (dr);
3571 return false;
3574 STMT_VINFO_DATA_REF (stmt_info) = dr;
3575 if (simd_lane_access)
3577 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3578 datarefs[i] = dr;
3581 /* Set vectype for STMT. */
3582 scalar_type = TREE_TYPE (DR_REF (dr));
3583 STMT_VINFO_VECTYPE (stmt_info) =
3584 get_vectype_for_scalar_type (scalar_type);
3585 if (!STMT_VINFO_VECTYPE (stmt_info))
3587 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3590 "not vectorized: no vectype for stmt: ");
3591 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3592 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3593 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3594 scalar_type);
3595 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3598 if (bb_vinfo)
3599 break;
3601 if (gather || simd_lane_access)
3603 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3604 free_data_ref (dr);
3606 return false;
3608 else
3610 if (dump_enabled_p ())
3612 dump_printf_loc (MSG_NOTE, vect_location,
3613 "got vectype for stmt: ");
3614 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3615 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3616 STMT_VINFO_VECTYPE (stmt_info));
3617 dump_printf (MSG_NOTE, "\n");
3621 /* Adjust the minimal vectorization factor according to the
3622 vector type. */
3623 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3624 if (vf > *min_vf)
3625 *min_vf = vf;
3627 if (gather)
3629 tree off;
3631 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3632 if (gather
3633 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3634 gather = false;
3635 if (!gather)
3637 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3638 free_data_ref (dr);
3639 if (dump_enabled_p ())
3641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3642 "not vectorized: not suitable for gather "
3643 "load ");
3644 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3645 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3647 return false;
3650 datarefs[i] = dr;
3651 STMT_VINFO_GATHER_P (stmt_info) = true;
3653 else if (loop_vinfo
3654 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3656 if (nested_in_vect_loop_p (loop, stmt)
3657 || !DR_IS_READ (dr))
3659 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3662 "not vectorized: not suitable for strided "
3663 "load ");
3664 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3665 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3667 return false;
3669 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3673 /* If we stopped analysis at the first dataref we could not analyze
3674 when trying to vectorize a basic-block mark the rest of the datarefs
3675 as not vectorizable and truncate the vector of datarefs. That
3676 avoids spending useless time in analyzing their dependence. */
3677 if (i != datarefs.length ())
3679 gcc_assert (bb_vinfo != NULL);
3680 for (unsigned j = i; j < datarefs.length (); ++j)
3682 data_reference_p dr = datarefs[j];
3683 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3684 free_data_ref (dr);
3686 datarefs.truncate (i);
3689 return true;
3693 /* Function vect_get_new_vect_var.
3695 Returns a name for a new variable. The current naming scheme appends the
3696 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3697 the name of vectorizer generated variables, and appends that to NAME if
3698 provided. */
3700 tree
3701 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3703 const char *prefix;
3704 tree new_vect_var;
3706 switch (var_kind)
3708 case vect_simple_var:
3709 prefix = "vect";
3710 break;
3711 case vect_scalar_var:
3712 prefix = "stmp";
3713 break;
3714 case vect_pointer_var:
3715 prefix = "vectp";
3716 break;
3717 default:
3718 gcc_unreachable ();
3721 if (name)
3723 char* tmp = concat (prefix, "_", name, NULL);
3724 new_vect_var = create_tmp_reg (type, tmp);
3725 free (tmp);
3727 else
3728 new_vect_var = create_tmp_reg (type, prefix);
3730 return new_vect_var;
3734 /* Function vect_create_addr_base_for_vector_ref.
3736 Create an expression that computes the address of the first memory location
3737 that will be accessed for a data reference.
3739 Input:
3740 STMT: The statement containing the data reference.
3741 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3742 OFFSET: Optional. If supplied, it is added to the initial address.
3743 LOOP: Specify relative to which loop-nest should the address be computed.
3744 For example, when the dataref is in an inner-loop nested in an
3745 outer-loop that is now being vectorized, LOOP can be either the
3746 outer-loop, or the inner-loop. The first memory location accessed
3747 by the following dataref ('in' points to short):
3749 for (i=0; i<N; i++)
3750 for (j=0; j<M; j++)
3751 s += in[i+j]
3753 is as follows:
3754 if LOOP=i_loop: &in (relative to i_loop)
3755 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3757 Output:
3758 1. Return an SSA_NAME whose value is the address of the memory location of
3759 the first vector of the data reference.
3760 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3761 these statement(s) which define the returned SSA_NAME.
3763 FORNOW: We are only handling array accesses with step 1. */
3765 tree
3766 vect_create_addr_base_for_vector_ref (gimple stmt,
3767 gimple_seq *new_stmt_list,
3768 tree offset,
3769 struct loop *loop)
3771 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3772 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3773 tree data_ref_base;
3774 const char *base_name;
3775 tree addr_base;
3776 tree dest;
3777 gimple_seq seq = NULL;
3778 tree base_offset;
3779 tree init;
3780 tree vect_ptr_type;
3781 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3782 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3784 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3786 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3788 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3790 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3791 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3792 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3794 else
3796 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3797 base_offset = unshare_expr (DR_OFFSET (dr));
3798 init = unshare_expr (DR_INIT (dr));
3801 if (loop_vinfo)
3802 base_name = get_name (data_ref_base);
3803 else
3805 base_offset = ssize_int (0);
3806 init = ssize_int (0);
3807 base_name = get_name (DR_REF (dr));
3810 /* Create base_offset */
3811 base_offset = size_binop (PLUS_EXPR,
3812 fold_convert (sizetype, base_offset),
3813 fold_convert (sizetype, init));
3815 if (offset)
3817 offset = fold_build2 (MULT_EXPR, sizetype,
3818 fold_convert (sizetype, offset), step);
3819 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3820 base_offset, offset);
3823 /* base + base_offset */
3824 if (loop_vinfo)
3825 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3826 else
3828 addr_base = build1 (ADDR_EXPR,
3829 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3830 unshare_expr (DR_REF (dr)));
3833 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3834 addr_base = fold_convert (vect_ptr_type, addr_base);
3835 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3836 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3837 gimple_seq_add_seq (new_stmt_list, seq);
3839 if (DR_PTR_INFO (dr)
3840 && TREE_CODE (addr_base) == SSA_NAME)
3842 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3843 if (offset)
3844 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3847 if (dump_enabled_p ())
3849 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3850 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3851 dump_printf (MSG_NOTE, "\n");
3854 return addr_base;
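/* A rough sketch of the result for the in[i+j] example in the comment
   above (the SSA names are illustrative only): with LOOP being the j-loop,
   the statements added to NEW_STMT_LIST compute something like
       vectp_in.N = (vector-pointer type) (&in + (sizetype) i * 2);
   and that SSA name is what gets returned, whereas with LOOP being the
   i-loop the base offset is zero and the result is simply &in converted
   to the vector pointer type.  */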
3858 /* Function vect_create_data_ref_ptr.
3860 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3861 location accessed in the loop by STMT, along with the def-use update
3862 chain to appropriately advance the pointer through the loop iterations.
3863 Also set aliasing information for the pointer. This pointer is used by
3864 the callers to this function to create a memory reference expression for
3865 vector load/store access.
3867 Input:
3868 1. STMT: a stmt that references memory. Expected to be of the form
3869 GIMPLE_ASSIGN <name, data-ref> or
3870 GIMPLE_ASSIGN <data-ref, name>.
3871 2. AGGR_TYPE: the type of the reference, which should be either a vector
3872 or an array.
3873 3. AT_LOOP: the loop where the vector memref is to be created.
3874 4. OFFSET (optional): an offset to be added to the initial address accessed
3875 by the data-ref in STMT.
3876 5. BSI: location where the new stmts are to be placed if there is no loop.
3877 6. ONLY_INIT: indicates whether ap is to be updated in the loop, or is to
3878 remain pointing to the initial address.
3880 Output:
3881 1. Declare a new ptr to vector_type, and have it point to the base of the
3882 data reference (initial address accessed by the data reference).
3883 For example, for vector of type V8HI, the following code is generated:
3885 v8hi *ap;
3886 ap = (v8hi *)initial_address;
3888 if OFFSET is not supplied:
3889 initial_address = &a[init];
3890 if OFFSET is supplied:
3891 initial_address = &a[init + OFFSET];
3893 Return the initial_address in INITIAL_ADDRESS.
3895 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3896 update the pointer in each iteration of the loop.
3898 Return the increment stmt that updates the pointer in PTR_INCR.
3900 3. Set INV_P to true if the access pattern of the data reference in the
3901 vectorized loop is invariant. Set it to false otherwise.
3903 4. Return the pointer. */
3905 tree
3906 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3907 tree offset, tree *initial_address,
3908 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3909 bool only_init, bool *inv_p)
3911 const char *base_name;
3912 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3913 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3914 struct loop *loop = NULL;
3915 bool nested_in_vect_loop = false;
3916 struct loop *containing_loop = NULL;
3917 tree aggr_ptr_type;
3918 tree aggr_ptr;
3919 tree new_temp;
3920 gimple vec_stmt;
3921 gimple_seq new_stmt_list = NULL;
3922 edge pe = NULL;
3923 basic_block new_bb;
3924 tree aggr_ptr_init;
3925 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3926 tree aptr;
3927 gimple_stmt_iterator incr_gsi;
3928 bool insert_after;
3929 tree indx_before_incr, indx_after_incr;
3930 gimple incr;
3931 tree step;
3932 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3934 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3935 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3937 if (loop_vinfo)
3939 loop = LOOP_VINFO_LOOP (loop_vinfo);
3940 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3941 containing_loop = (gimple_bb (stmt))->loop_father;
3942 pe = loop_preheader_edge (loop);
3944 else
3946 gcc_assert (bb_vinfo);
3947 only_init = true;
3948 *ptr_incr = NULL;
3951 /* Check the step (evolution) of the load in LOOP, and record
3952 whether it's invariant. */
3953 if (nested_in_vect_loop)
3954 step = STMT_VINFO_DR_STEP (stmt_info);
3955 else
3956 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3958 if (integer_zerop (step))
3959 *inv_p = true;
3960 else
3961 *inv_p = false;
3963 /* Create an expression for the first address accessed by this load
3964 in LOOP. */
3965 base_name = get_name (DR_BASE_ADDRESS (dr));
3967 if (dump_enabled_p ())
3969 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3970 dump_printf_loc (MSG_NOTE, vect_location,
3971 "create %s-pointer variable to type: ",
3972 get_tree_code_name (TREE_CODE (aggr_type)));
3973 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3974 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3975 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3976 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
3977 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
3978 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3979 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3980 else
3981 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3982 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3983 dump_printf (MSG_NOTE, "\n");
3986 /* (1) Create the new aggregate-pointer variable.
3987 Vector and array types inherit the alias set of their component
3988 type by default so we need to use a ref-all pointer if the data
3989 reference does not conflict with the created aggregated data
3990 reference because it is not addressable. */
3991 bool need_ref_all = false;
3992 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3993 get_alias_set (DR_REF (dr))))
3994 need_ref_all = true;
3995 /* Likewise for any of the data references in the stmt group. */
3996 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3998 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4001 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4002 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4003 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4004 get_alias_set (DR_REF (sdr))))
4006 need_ref_all = true;
4007 break;
4009 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4011 while (orig_stmt);
4013 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4014 need_ref_all);
4015 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4018 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4019 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4020 def-use update cycles for the pointer: one relative to the outer-loop
4021 (LOOP), which is what steps (3) and (4) below do. The other is relative
4022 to the inner-loop (which is the inner-most loop containing the dataref),
4023 and this is done by step (5) below.
4025 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4026 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4027 redundant. Steps (3),(4) create the following:
4029 vp0 = &base_addr;
4030 LOOP: vp1 = phi(vp0,vp2)
4033 vp2 = vp1 + step
4034 goto LOOP
4036 If there is an inner-loop nested in loop, then step (5) will also be
4037 applied, and an additional update in the inner-loop will be created:
4039 vp0 = &base_addr;
4040 LOOP: vp1 = phi(vp0,vp2)
4042 inner: vp3 = phi(vp1,vp4)
4043 vp4 = vp3 + inner_step
4044 if () goto inner
4046 vp2 = vp1 + step
4047 if () goto LOOP */
4049 /* (2) Calculate the initial address of the aggregate-pointer, and set
4050 the aggregate-pointer to point to it before the loop. */
4052 /* Create: (&(base[init_val+offset]) in the loop preheader. */
4054 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4055 offset, loop);
4056 if (new_stmt_list)
4058 if (pe)
4060 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4061 gcc_assert (!new_bb);
4063 else
4064 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4067 *initial_address = new_temp;
4069 /* Create: p = (aggr_type *) initial_base */
4070 if (TREE_CODE (new_temp) != SSA_NAME
4071 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4073 vec_stmt = gimple_build_assign (aggr_ptr,
4074 fold_convert (aggr_ptr_type, new_temp));
4075 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4076 /* Copy the points-to information if it exists. */
4077 if (DR_PTR_INFO (dr))
4078 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4079 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4080 if (pe)
4082 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4083 gcc_assert (!new_bb);
4085 else
4086 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4088 else
4089 aggr_ptr_init = new_temp;
4091 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4092 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4093 inner-loop nested in LOOP (during outer-loop vectorization). */
4095 /* No update in loop is required. */
4096 if (only_init && (!loop_vinfo || at_loop == loop))
4097 aptr = aggr_ptr_init;
4098 else
4100 /* The step of the aggregate pointer is the type size. */
4101 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4102 /* One exception to the above is when the scalar step of the load in
4103 LOOP is zero. In this case the step here is also zero. */
4104 if (*inv_p)
4105 iv_step = size_zero_node;
4106 else if (tree_int_cst_sgn (step) == -1)
4107 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4109 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4111 create_iv (aggr_ptr_init,
4112 fold_convert (aggr_ptr_type, iv_step),
4113 aggr_ptr, loop, &incr_gsi, insert_after,
4114 &indx_before_incr, &indx_after_incr);
4115 incr = gsi_stmt (incr_gsi);
4116 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4118 /* Copy the points-to information if it exists. */
4119 if (DR_PTR_INFO (dr))
4121 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4122 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4124 if (ptr_incr)
4125 *ptr_incr = incr;
4127 aptr = indx_before_incr;
4130 if (!nested_in_vect_loop || only_init)
4131 return aptr;
4134 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4135 nested in LOOP, if exists. */
4137 gcc_assert (nested_in_vect_loop);
4138 if (!only_init)
4140 standard_iv_increment_position (containing_loop, &incr_gsi,
4141 &insert_after);
4142 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4143 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4144 &indx_after_incr);
4145 incr = gsi_stmt (incr_gsi);
4146 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4148 /* Copy the points-to information if it exists. */
4149 if (DR_PTR_INFO (dr))
4151 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4152 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4154 if (ptr_incr)
4155 *ptr_incr = incr;
4157 return indx_before_incr;
4159 else
4160 gcc_unreachable ();
4164 /* Function bump_vector_ptr
4166 Increment a pointer (to a vector type) by vector-size. If requested,
4167 i.e. if PTR_INCR is given, then also connect the new increment stmt
4168 to the existing def-use update-chain of the pointer, by modifying
4169 the PTR_INCR as illustrated below:
4171 The pointer def-use update-chain before this function:
4172 DATAREF_PTR = phi (p_0, p_2)
4173 ....
4174 PTR_INCR: p_2 = DATAREF_PTR + step
4176 The pointer def-use update-chain after this function:
4177 DATAREF_PTR = phi (p_0, p_2)
4178 ....
4179 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4180 ....
4181 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4183 Input:
4184 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4185 in the loop.
4186 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4187 the loop. The increment amount across iterations is expected
4188 to be vector_size.
4189 BSI - location where the new update stmt is to be placed.
4190 STMT - the original scalar memory-access stmt that is being vectorized.
4191 BUMP - optional. The offset by which to bump the pointer. If not given,
4192 the offset is assumed to be vector_size.
4194 Output: Return NEW_DATAREF_PTR as illustrated above.
4198 tree
4199 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4200 gimple stmt, tree bump)
4202 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4203 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4204 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4205 tree update = TYPE_SIZE_UNIT (vectype);
4206 gimple incr_stmt;
4207 ssa_op_iter iter;
4208 use_operand_p use_p;
4209 tree new_dataref_ptr;
4211 if (bump)
4212 update = bump;
4214 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4215 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4216 dataref_ptr, update);
4217 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4219 /* Copy the points-to information if it exists. */
4220 if (DR_PTR_INFO (dr))
4222 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4223 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4226 if (!ptr_incr)
4227 return new_dataref_ptr;
4229 /* Update the vector-pointer's cross-iteration increment. */
4230 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4232 tree use = USE_FROM_PTR (use_p);
4234 if (use == dataref_ptr)
4235 SET_USE (use_p, new_dataref_ptr);
4236 else
4237 gcc_assert (tree_int_cst_compare (use, update) == 0);
4240 return new_dataref_ptr;
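/* Note that when BUMP is omitted the default bump above is TYPE_SIZE_UNIT
   of the vector type (e.g. 16 bytes for a V8HI vector), so each call
   advances the pointer by exactly one vector.  */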
4244 /* Function vect_create_destination_var.
4246 Create a new temporary of type VECTYPE (or of SCALAR_DEST's type if VECTYPE is NULL).
4248 tree
4249 vect_create_destination_var (tree scalar_dest, tree vectype)
4251 tree vec_dest;
4252 const char *name;
4253 char *new_name;
4254 tree type;
4255 enum vect_var_kind kind;
4257 kind = vectype ? vect_simple_var : vect_scalar_var;
4258 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4260 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4262 name = get_name (scalar_dest);
4263 if (name)
4264 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4265 else
4266 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4267 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4268 free (new_name);
4270 return vec_dest;
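/* For instance (with an illustrative SSA name): if SCALAR_DEST is the SSA
   name x_5 and VECTYPE is non-NULL, the code above builds the string "x_5"
   and the returned variable's name is based on "vect_x_5"; with a NULL
   VECTYPE the "stmp" prefix is used instead.  */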
4273 /* Function vect_grouped_store_supported.
4275 Returns TRUE if interleave high and interleave low permutations
4276 are supported, and FALSE otherwise. */
4278 bool
4279 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4281 enum machine_mode mode = TYPE_MODE (vectype);
4283 /* vect_permute_store_chain requires the group size to be a power of two. */
4284 if (exact_log2 (count) == -1)
4286 if (dump_enabled_p ())
4287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4288 "the size of the group of accesses"
4289 " is not a power of 2\n");
4290 return false;
4293 /* Check that the permutation is supported. */
4294 if (VECTOR_MODE_P (mode))
4296 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4297 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4298 for (i = 0; i < nelt / 2; i++)
4300 sel[i * 2] = i;
4301 sel[i * 2 + 1] = i + nelt;
4303 if (can_vec_perm_p (mode, false, sel))
4305 for (i = 0; i < nelt; i++)
4306 sel[i] += nelt / 2;
4307 if (can_vec_perm_p (mode, false, sel))
4308 return true;
4312 if (dump_enabled_p ())
4313 dump_printf (MSG_MISSED_OPTIMIZATION,
4314 "interleave op not supported by target.\n");
4315 return false;
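/* To make the check above concrete: for a 4-element vector mode the first
   mask built is {0, 4, 1, 5} (interleave high) and, after adding nelt/2 to
   each entry, {2, 6, 3, 7} (interleave low), matching the
   interleave_high/low example in the comment of vect_permute_store_chain
   below.  */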
4319 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4320 type VECTYPE. */
4322 bool
4323 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4325 return vect_lanes_optab_supported_p ("vec_store_lanes",
4326 vec_store_lanes_optab,
4327 vectype, count);
4331 /* Function vect_permute_store_chain.
4333 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4334 a power of 2, generate interleave_high/low stmts to reorder the data
4335 correctly for the stores. Return the final references for stores in
4336 RESULT_CHAIN.
4338 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4339 The input is 4 vectors each containing 8 elements. We assign a number to
4340 each element, the input sequence is:
4342 1st vec: 0 1 2 3 4 5 6 7
4343 2nd vec: 8 9 10 11 12 13 14 15
4344 3rd vec: 16 17 18 19 20 21 22 23
4345 4th vec: 24 25 26 27 28 29 30 31
4347 The output sequence should be:
4349 1st vec: 0 8 16 24 1 9 17 25
4350 2nd vec: 2 10 18 26 3 11 19 27
4351 3rd vec: 4 12 20 28 5 13 21 29
4352 4th vec: 6 14 22 30 7 15 23 31
4354 i.e., we interleave the contents of the four vectors in their order.
4356 We use interleave_high/low instructions to create such output. The input of
4357 each interleave_high/low operation is two vectors:
4358 1st vec 2nd vec
4359 0 1 2 3 4 5 6 7
4360 the even elements of the result vector are obtained left-to-right from the
4361 high/low elements of the first vector. The odd elements of the result are
4362 obtained left-to-right from the high/low elements of the second vector.
4363 The output of interleave_high will be: 0 4 1 5
4364 and of interleave_low: 2 6 3 7
4367 The permutation is done in log LENGTH stages. In each stage interleave_high
4368 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4369 where the first argument is taken from the first half of DR_CHAIN and the
4370 second argument from its second half.
4371 In our example,
4373 I1: interleave_high (1st vec, 3rd vec)
4374 I2: interleave_low (1st vec, 3rd vec)
4375 I3: interleave_high (2nd vec, 4th vec)
4376 I4: interleave_low (2nd vec, 4th vec)
4378 The output for the first stage is:
4380 I1: 0 16 1 17 2 18 3 19
4381 I2: 4 20 5 21 6 22 7 23
4382 I3: 8 24 9 25 10 26 11 27
4383 I4: 12 28 13 29 14 30 15 31
4385 The output of the second stage, i.e. the final result is:
4387 I1: 0 8 16 24 1 9 17 25
4388 I2: 2 10 18 26 3 11 19 27
4389 I3: 4 12 20 28 5 13 21 29
4390 I4: 6 14 22 30 7 15 23 31. */
4392 void
4393 vect_permute_store_chain (vec<tree> dr_chain,
4394 unsigned int length,
4395 gimple stmt,
4396 gimple_stmt_iterator *gsi,
4397 vec<tree> *result_chain)
4399 tree vect1, vect2, high, low;
4400 gimple perm_stmt;
4401 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4402 tree perm_mask_low, perm_mask_high;
4403 unsigned int i, n;
4404 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4405 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4407 result_chain->quick_grow (length);
4408 memcpy (result_chain->address (), dr_chain.address (),
4409 length * sizeof (tree));
4411 for (i = 0, n = nelt / 2; i < n; i++)
4413 sel[i * 2] = i;
4414 sel[i * 2 + 1] = i + nelt;
4416 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4417 gcc_assert (perm_mask_high != NULL);
4419 for (i = 0; i < nelt; i++)
4420 sel[i] += nelt / 2;
4421 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4422 gcc_assert (perm_mask_low != NULL);
4424 for (i = 0, n = exact_log2 (length); i < n; i++)
4426 for (j = 0; j < length/2; j++)
4428 vect1 = dr_chain[j];
4429 vect2 = dr_chain[j+length/2];
4431 /* Create interleaving stmt:
4432 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4433 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4434 perm_stmt
4435 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4436 vect1, vect2, perm_mask_high);
4437 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4438 (*result_chain)[2*j] = high;
4440 /* Create interleaving stmt:
4441 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4442 nelt*3/2+1, ...}> */
4443 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4444 perm_stmt
4445 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4446 vect1, vect2, perm_mask_low);
4447 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4448 (*result_chain)[2*j+1] = low;
4450 memcpy (dr_chain.address (), result_chain->address (),
4451 length * sizeof (tree));
4455 /* Function vect_setup_realignment
4457 This function is called when vectorizing an unaligned load using
4458 the dr_explicit_realign[_optimized] scheme.
4459 This function generates the following code at the loop prolog:
4461 p = initial_addr;
4462 x msq_init = *(floor(p)); # prolog load
4463 realignment_token = call target_builtin;
4464 loop:
4465 x msq = phi (msq_init, ---)
4467 The stmts marked with x are generated only for the case of
4468 dr_explicit_realign_optimized.
4470 The code above sets up a new (vector) pointer, pointing to the first
4471 location accessed by STMT, and a "floor-aligned" load using that pointer.
4472 It also generates code to compute the "realignment-token" (if the relevant
4473 target hook was defined), and creates a phi-node at the loop-header bb
4474 whose arguments are the result of the prolog-load (created by this
4475 function) and the result of a load that takes place in the loop (to be
4476 created by the caller to this function).
4478 For the case of dr_explicit_realign_optimized:
4479 The caller to this function uses the phi-result (msq) to create the
4480 realignment code inside the loop, and sets up the missing phi argument,
4481 as follows:
4482 loop:
4483 msq = phi (msq_init, lsq)
4484 lsq = *(floor(p')); # load in loop
4485 result = realign_load (msq, lsq, realignment_token);
4487 For the case of dr_explicit_realign:
4488 loop:
4489 msq = *(floor(p)); # load in loop
4490 p' = p + (VS-1);
4491 lsq = *(floor(p')); # load in loop
4492 result = realign_load (msq, lsq, realignment_token);
4494 Input:
4495 STMT - (scalar) load stmt to be vectorized. This load accesses
4496 a memory location that may be unaligned.
4497 BSI - place where new code is to be inserted.
4498 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4499 is used.
4501 Output:
4502 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4503 target hook, if defined.
4504 Return value - the result of the loop-header phi node. */
4506 tree
4507 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4508 tree *realignment_token,
4509 enum dr_alignment_support alignment_support_scheme,
4510 tree init_addr,
4511 struct loop **at_loop)
4513 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4514 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4515 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4516 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4517 struct loop *loop = NULL;
4518 edge pe = NULL;
4519 tree scalar_dest = gimple_assign_lhs (stmt);
4520 tree vec_dest;
4521 gimple inc;
4522 tree ptr;
4523 tree data_ref;
4524 gimple new_stmt;
4525 basic_block new_bb;
4526 tree msq_init = NULL_TREE;
4527 tree new_temp;
4528 gimple phi_stmt;
4529 tree msq = NULL_TREE;
4530 gimple_seq stmts = NULL;
4531 bool inv_p;
4532 bool compute_in_loop = false;
4533 bool nested_in_vect_loop = false;
4534 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4535 struct loop *loop_for_initial_load = NULL;
4537 if (loop_vinfo)
4539 loop = LOOP_VINFO_LOOP (loop_vinfo);
4540 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4543 gcc_assert (alignment_support_scheme == dr_explicit_realign
4544 || alignment_support_scheme == dr_explicit_realign_optimized);
4546 /* We need to generate three things:
4547 1. the misalignment computation
4548 2. the extra vector load (for the optimized realignment scheme).
4549 3. the phi node for the two vectors from which the realignment is
4550 done (for the optimized realignment scheme). */
4552 /* 1. Determine where to generate the misalignment computation.
4554 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4555 calculation will be generated by this function, outside the loop (in the
4556 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4557 caller, inside the loop.
4559 Background: If the misalignment remains fixed throughout the iterations of
4560 the loop, then both realignment schemes are applicable, and also the
4561 misalignment computation can be done outside LOOP. This is because we are
4562 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4563 are a multiple of VS (the Vector Size), and therefore the misalignment in
4564 different vectorized LOOP iterations is always the same.
4565 The problem arises only if the memory access is in an inner-loop nested
4566 inside LOOP, which is now being vectorized using outer-loop vectorization.
4567 This is the only case when the misalignment of the memory access may not
4568 remain fixed throughout the iterations of the inner-loop (as explained in
4569 detail in vect_supportable_dr_alignment). In this case, not only is the
4570 optimized realignment scheme not applicable, but also the misalignment
4571 computation (and generation of the realignment token that is passed to
4572 REALIGN_LOAD) have to be done inside the loop.
4574 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4575 or not, which in turn determines if the misalignment is computed inside
4576 the inner-loop, or outside LOOP. */
4578 if (init_addr != NULL_TREE || !loop_vinfo)
4580 compute_in_loop = true;
4581 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4585 /* 2. Determine where to generate the extra vector load.
4587 For the optimized realignment scheme, instead of generating two vector
4588 loads in each iteration, we generate a single extra vector load in the
4589 preheader of the loop, and in each iteration reuse the result of the
4590 vector load from the previous iteration. In case the memory access is in
4591 an inner-loop nested inside LOOP, which is now being vectorized using
4592 outer-loop vectorization, we need to determine whether this initial vector
4593 load should be generated at the preheader of the inner-loop, or can be
4594 generated at the preheader of LOOP. If the memory access has no evolution
4595 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4596 to be generated inside LOOP (in the preheader of the inner-loop). */
4598 if (nested_in_vect_loop)
4600 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4601 bool invariant_in_outerloop =
4602 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4603 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4605 else
4606 loop_for_initial_load = loop;
4607 if (at_loop)
4608 *at_loop = loop_for_initial_load;
4610 if (loop_for_initial_load)
4611 pe = loop_preheader_edge (loop_for_initial_load);
4613 /* 3. For the case of the optimized realignment, create the first vector
4614 load at the loop preheader. */
4616 if (alignment_support_scheme == dr_explicit_realign_optimized)
4618 /* Create msq_init = *(floor(p1)) in the loop preheader */
4620 gcc_assert (!compute_in_loop);
4621 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4622 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4623 NULL_TREE, &init_addr, NULL, &inc,
4624 true, &inv_p);
4625 new_temp = copy_ssa_name (ptr, NULL);
4626 new_stmt = gimple_build_assign_with_ops
4627 (BIT_AND_EXPR, new_temp, ptr,
4628 build_int_cst (TREE_TYPE (ptr),
4629 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4630 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4631 gcc_assert (!new_bb);
4632 data_ref
4633 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4634 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4635 new_stmt = gimple_build_assign (vec_dest, data_ref);
4636 new_temp = make_ssa_name (vec_dest, new_stmt);
4637 gimple_assign_set_lhs (new_stmt, new_temp);
4638 if (pe)
4640 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4641 gcc_assert (!new_bb);
4643 else
4644 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4646 msq_init = gimple_assign_lhs (new_stmt);
4649 /* 4. Create realignment token using a target builtin, if available.
4650 It is done either inside the containing loop, or before LOOP (as
4651 determined above). */
4653 if (targetm.vectorize.builtin_mask_for_load)
4655 tree builtin_decl;
4657 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4658 if (!init_addr)
4660 /* Generate the INIT_ADDR computation outside LOOP. */
4661 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4662 NULL_TREE, loop);
4663 if (loop)
4665 pe = loop_preheader_edge (loop);
4666 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4667 gcc_assert (!new_bb);
4669 else
4670 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4673 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4674 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4675 vec_dest =
4676 vect_create_destination_var (scalar_dest,
4677 gimple_call_return_type (new_stmt));
4678 new_temp = make_ssa_name (vec_dest, new_stmt);
4679 gimple_call_set_lhs (new_stmt, new_temp);
4681 if (compute_in_loop)
4682 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4683 else
4685 /* Generate the misalignment computation outside LOOP. */
4686 pe = loop_preheader_edge (loop);
4687 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4688 gcc_assert (!new_bb);
4691 *realignment_token = gimple_call_lhs (new_stmt);
4693 /* The result of the CALL_EXPR to this builtin is determined from
4694 the value of the parameter and no global variables are touched
4695 which makes the builtin a "const" function. Requiring the
4696 builtin to have the "const" attribute makes it unnecessary
4697 to call mark_call_clobbered. */
4698 gcc_assert (TREE_READONLY (builtin_decl));
4701 if (alignment_support_scheme == dr_explicit_realign)
4702 return msq;
4704 gcc_assert (!compute_in_loop);
4705 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4708 /* 5. Create msq = phi <msq_init, lsq> in loop */
4710 pe = loop_preheader_edge (containing_loop);
4711 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4712 msq = make_ssa_name (vec_dest, NULL);
4713 phi_stmt = create_phi_node (msq, containing_loop->header);
4714 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4716 return msq;
4720 /* Function vect_grouped_load_supported.
4722 Returns TRUE if even and odd permutations are supported,
4723 and FALSE otherwise. */
4725 bool
4726 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4728 enum machine_mode mode = TYPE_MODE (vectype);
4730 /* vect_permute_load_chain requires the group size to be a power of two. */
4731 if (exact_log2 (count) == -1)
4733 if (dump_enabled_p ())
4734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4735 "the size of the group of accesses"
4736 " is not a power of 2\n");
4737 return false;
4740 /* Check that the permutation is supported. */
4741 if (VECTOR_MODE_P (mode))
4743 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4744 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4746 for (i = 0; i < nelt; i++)
4747 sel[i] = i * 2;
4748 if (can_vec_perm_p (mode, false, sel))
4750 for (i = 0; i < nelt; i++)
4751 sel[i] = i * 2 + 1;
4752 if (can_vec_perm_p (mode, false, sel))
4753 return true;
4757 if (dump_enabled_p ())
4758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4759 "extract even/odd not supported by target\n");
4760 return false;
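/* For a 4-element vector mode the two masks checked above are {0, 2, 4, 6}
   (extract even) and {1, 3, 5, 7} (extract odd), the same permutations
   that vect_permute_load_chain generates below.  */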
4763 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4764 type VECTYPE. */
4766 bool
4767 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4769 return vect_lanes_optab_supported_p ("vec_load_lanes",
4770 vec_load_lanes_optab,
4771 vectype, count);
4774 /* Function vect_permute_load_chain.
4776 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4777 a power of 2, generate extract_even/odd stmts to reorder the input data
4778 correctly. Return the final references for loads in RESULT_CHAIN.
4780 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4781 The input is 4 vectors each containing 8 elements. We assign a number to each
4782 element, the input sequence is:
4784 1st vec: 0 1 2 3 4 5 6 7
4785 2nd vec: 8 9 10 11 12 13 14 15
4786 3rd vec: 16 17 18 19 20 21 22 23
4787 4th vec: 24 25 26 27 28 29 30 31
4789 The output sequence should be:
4791 1st vec: 0 4 8 12 16 20 24 28
4792 2nd vec: 1 5 9 13 17 21 25 29
4793 3rd vec: 2 6 10 14 18 22 26 30
4794 4th vec: 3 7 11 15 19 23 27 31
4796 i.e., the first output vector should contain the first elements of each
4797 interleaving group, etc.
4799 We use extract_even/odd instructions to create such output. The input of
4800 each extract_even/odd operation is two vectors
4801 1st vec 2nd vec
4802 0 1 2 3 4 5 6 7
4804 and the output is the vector of extracted even/odd elements. The output of
4805 extract_even will be: 0 2 4 6
4806 and of extract_odd: 1 3 5 7
4809 The permutation is done in log LENGTH stages. In each stage extract_even
4810 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4811 their order. In our example,
4813 E1: extract_even (1st vec, 2nd vec)
4814 E2: extract_odd (1st vec, 2nd vec)
4815 E3: extract_even (3rd vec, 4th vec)
4816 E4: extract_odd (3rd vec, 4th vec)
4818 The output for the first stage will be:
4820 E1: 0 2 4 6 8 10 12 14
4821 E2: 1 3 5 7 9 11 13 15
4822 E3: 16 18 20 22 24 26 28 30
4823 E4: 17 19 21 23 25 27 29 31
4825 In order to proceed and create the correct sequence for the next stage (or
4826 for the correct output, if the second stage is the last one, as in our
4827 example), we first put the output of extract_even operation and then the
4828 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4829 The input for the second stage is:
4831 1st vec (E1): 0 2 4 6 8 10 12 14
4832 2nd vec (E3): 16 18 20 22 24 26 28 30
4833 3rd vec (E2): 1 3 5 7 9 11 13 15
4834 4th vec (E4): 17 19 21 23 25 27 29 31
4836 The output of the second stage:
4838 E1: 0 4 8 12 16 20 24 28
4839 E2: 2 6 10 14 18 22 26 30
4840 E3: 1 5 9 13 17 21 25 29
4841 E4: 3 7 11 15 19 23 27 31
4843 And RESULT_CHAIN after reordering:
4845 1st vec (E1): 0 4 8 12 16 20 24 28
4846 2nd vec (E3): 1 5 9 13 17 21 25 29
4847 3rd vec (E2): 2 6 10 14 18 22 26 30
4848 4th vec (E4): 3 7 11 15 19 23 27 31. */
4850 static void
4851 vect_permute_load_chain (vec<tree> dr_chain,
4852 unsigned int length,
4853 gimple stmt,
4854 gimple_stmt_iterator *gsi,
4855 vec<tree> *result_chain)
4857 tree data_ref, first_vect, second_vect;
4858 tree perm_mask_even, perm_mask_odd;
4859 gimple perm_stmt;
4860 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4861 unsigned int i, j, log_length = exact_log2 (length);
4862 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4863 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4865 result_chain->quick_grow (length);
4866 memcpy (result_chain->address (), dr_chain.address (),
4867 length * sizeof (tree));
4869 for (i = 0; i < nelt; ++i)
4870 sel[i] = i * 2;
4871 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4872 gcc_assert (perm_mask_even != NULL);
4874 for (i = 0; i < nelt; ++i)
4875 sel[i] = i * 2 + 1;
4876 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4877 gcc_assert (perm_mask_odd != NULL);
4879 for (i = 0; i < log_length; i++)
4881 for (j = 0; j < length; j += 2)
4883 first_vect = dr_chain[j];
4884 second_vect = dr_chain[j+1];
4886 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4887 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4888 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4889 first_vect, second_vect,
4890 perm_mask_even);
4891 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4892 (*result_chain)[j/2] = data_ref;
4894 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4895 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4896 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4897 first_vect, second_vect,
4898 perm_mask_odd);
4899 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4900 (*result_chain)[j/2+length/2] = data_ref;
4902 memcpy (dr_chain.address (), result_chain->address (),
4903 length * sizeof (tree));
4908 /* Function vect_transform_grouped_load.
4910 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4911 to perform their permutation and ascribe the result vectorized statements to
4912 the scalar statements.
4915 void
4916 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4917 gimple_stmt_iterator *gsi)
4919 vec<tree> result_chain = vNULL;
4921 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4922 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4923 vectors, that are ready for vector computation. */
4924 result_chain.create (size);
4925 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4926 vect_record_grouped_load_vectors (stmt, result_chain);
4927 result_chain.release ();
4930 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4931 generated as part of the vectorization of STMT. Assign the statement
4932 for each vector to the associated scalar statement. */
4934 void
4935 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4937 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4938 gimple next_stmt, new_stmt;
4939 unsigned int i, gap_count;
4940 tree tmp_data_ref;
4942 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4943 Since we scan the chain starting from its first node, their order
4944 corresponds to the order of data-refs in RESULT_CHAIN.
4945 next_stmt = first_stmt;
4946 gap_count = 1;
4947 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4949 if (!next_stmt)
4950 break;
4952 /* Skip the gaps. Loads created for the gaps will be removed by dead
4953 code elimination pass later. No need to check for the first stmt in
4954 the group, since it always exists.
4955 GROUP_GAP is the number of steps in elements from the previous
4956 access (if there is no gap GROUP_GAP is 1). We skip loads that
4957 correspond to the gaps. */
4958 if (next_stmt != first_stmt
4959 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4961 gap_count++;
4962 continue;
4965 while (next_stmt)
4967 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4968 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4969 copies, and we put the new vector statement in the first available
4970 RELATED_STMT. */
4971 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4972 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4973 else
4975 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4977 gimple prev_stmt =
4978 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4979 gimple rel_stmt =
4980 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4981 while (rel_stmt)
4983 prev_stmt = rel_stmt;
4984 rel_stmt =
4985 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4988 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4989 new_stmt;
4993 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4994 gap_count = 1;
4995 /* If NEXT_STMT accesses the same DR as the previous statement,
4996 put the same TMP_DATA_REF as its vectorized statement; otherwise
4997 get the next data-ref from RESULT_CHAIN. */
4998 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4999 break;
5004 /* Function vect_can_force_dr_alignment_p.
5006 Returns whether the alignment of a DECL can be forced to be aligned
5007 on ALIGNMENT bit boundary. */
5009 bool
5010 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5012 if (TREE_CODE (decl) != VAR_DECL)
5013 return false;
5015 /* We cannot change alignment of common or external symbols as another
5016 translation unit may contain a definition with lower alignment.
5017 The rules of common symbol linking mean that the definition
5018 will override the common symbol. The same is true for constant
5019 pool entries which may be shared and are not properly merged
5020 by LTO. */
5021 if (DECL_EXTERNAL (decl)
5022 || DECL_COMMON (decl)
5023 || DECL_IN_CONSTANT_POOL (decl))
5024 return false;
5026 if (TREE_ASM_WRITTEN (decl))
5027 return false;
5029 /* Do not override the alignment as specified by the ABI when the used
5030 attribute is set. */
5031 if (DECL_PRESERVE_P (decl))
5032 return false;
5034 /* Do not override explicit alignment set by the user when an explicit
5035 section name is also used. This is a common idiom used by many
5036 software projects. */
5037 if (DECL_SECTION_NAME (decl) != NULL_TREE
5038 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
5039 return false;
5041 if (TREE_STATIC (decl))
5042 return (alignment <= MAX_OFILE_ALIGNMENT);
5043 else
5044 return (alignment <= MAX_STACK_ALIGNMENT);
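/* Putting the tests above together with an illustrative declaration: a
   file-local "static short a[N];" may have its alignment forced up to
   MAX_OFILE_ALIGNMENT, whereas an external or common array, a decl that
   has already been written out to asm, or one carrying the "used"
   attribute or an explicit user section keeps whatever alignment it
   already has.  */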
5048 /* Return whether the data reference DR is supported with respect to its
5049 alignment.
5050 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5051 it is aligned, i.e., check if it is possible to vectorize it with different
5052 alignment. */
5054 enum dr_alignment_support
5055 vect_supportable_dr_alignment (struct data_reference *dr,
5056 bool check_aligned_accesses)
5058 gimple stmt = DR_STMT (dr);
5059 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5060 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5061 enum machine_mode mode = TYPE_MODE (vectype);
5062 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5063 struct loop *vect_loop = NULL;
5064 bool nested_in_vect_loop = false;
5066 if (aligned_access_p (dr) && !check_aligned_accesses)
5067 return dr_aligned;
5069 if (loop_vinfo)
5071 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5072 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5075 /* Possibly unaligned access. */
5077 /* We can choose between using the implicit realignment scheme (generating
5078 a misaligned_move stmt) and the explicit realignment scheme (generating
5079 aligned loads with a REALIGN_LOAD). There are two variants to the
5080 explicit realignment scheme: optimized, and unoptimized.
5081 We can optimize the realignment only if the step between consecutive
5082 vector loads is equal to the vector size. Since the vector memory
5083 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5084 is guaranteed that the misalignment amount remains the same throughout the
5085 execution of the vectorized loop. Therefore, we can create the
5086 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5087 at the loop preheader.
5089 However, in the case of outer-loop vectorization, when vectorizing a
5090 memory access in the inner-loop nested within the LOOP that is now being
5091 vectorized, while it is guaranteed that the misalignment of the
5092 vectorized memory access will remain the same in different outer-loop
5093 iterations, it is *not* guaranteed that it will remain the same throughout
5094 the execution of the inner-loop. This is because the inner-loop advances
5095 with the original scalar step (and not in steps of VS). If the inner-loop
5096 step happens to be a multiple of VS, then the misalignment remains fixed
5097 and we can use the optimized realignment scheme. For example:
5099 for (i=0; i<N; i++)
5100 for (j=0; j<M; j++)
5101 s += a[i+j];
5103 When vectorizing the i-loop in the above example, the step between
5104 consecutive vector loads is 1, and so the misalignment does not remain
5105 fixed across the execution of the inner-loop, and the realignment cannot
5106 be optimized (as illustrated in the following pseudo vectorized loop):
5108 for (i=0; i<N; i+=4)
5109 for (j=0; j<M; j++){
5110 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5111 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5112 // (assuming that we start from an aligned address).
5115 We therefore have to use the unoptimized realignment scheme:
5117 for (i=0; i<N; i+=4)
5118 for (j=k; j<M; j+=4)
5119 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5120 // that the misalignment of the initial address is
5121 // 0).
5123 The loop can then be vectorized as follows:
5125 for (k=0; k<4; k++){
5126 rt = get_realignment_token (&vp[k]);
5127 for (i=0; i<N; i+=4){
5128 v1 = vp[i+k];
5129 for (j=k; j<M; j+=4){
5130 v2 = vp[i+j+VS-1];
5131 va = REALIGN_LOAD <v1,v2,rt>;
5132 vs += va;
5133 v1 = v2;
5136 } */
5138 if (DR_IS_READ (dr))
5140 bool is_packed = false;
5141 tree type = (TREE_TYPE (DR_REF (dr)));
5143 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5144 && (!targetm.vectorize.builtin_mask_for_load
5145 || targetm.vectorize.builtin_mask_for_load ()))
5147 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5148 if ((nested_in_vect_loop
5149 && (TREE_INT_CST_LOW (DR_STEP (dr))
5150 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5151 || !loop_vinfo)
5152 return dr_explicit_realign;
5153 else
5154 return dr_explicit_realign_optimized;
5156 if (!known_alignment_for_access_p (dr))
5157 is_packed = not_size_aligned (DR_REF (dr));
5159 if ((TYPE_USER_ALIGN (type) && !is_packed)
5160 || targetm.vectorize.
5161 support_vector_misalignment (mode, type,
5162 DR_MISALIGNMENT (dr), is_packed))
5163 /* Can't software pipeline the loads, but can at least do them. */
5164 return dr_unaligned_supported;
5166 else
5168 bool is_packed = false;
5169 tree type = (TREE_TYPE (DR_REF (dr)));
5171 if (!known_alignment_for_access_p (dr))
5172 is_packed = not_size_aligned (DR_REF (dr));
5174 if ((TYPE_USER_ALIGN (type) && !is_packed)
5175 || targetm.vectorize.
5176 support_vector_misalignment (mode, type,
5177 DR_MISALIGNMENT (dr), is_packed))
5178 return dr_unaligned_supported;
5181 /* Unsupported. */
5182 return dr_unaligned_unsupported;